{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "e:\\python\\lib\\site-packages\\numpy\\_distributor_init.py:30: UserWarning: loaded more than 1 DLL from .libs:\n",
      "e:\\python\\lib\\site-packages\\numpy\\.libs\\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll\n",
      "e:\\python\\lib\\site-packages\\numpy\\.libs\\libopenblas64__v0.3.21-gcc_10_3_0.dll\n",
      "  warnings.warn(\"loaded more than 1 DLL from .libs:\"\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import warnings\n",
    "import sys\n",
    "import os\n",
    "sys.path.append(\"../\")\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "from Utils.utils import ReverseDic\n",
    "import Utils.utils as utils"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.People处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>resumeId</th>\n",
       "      <th>username</th>\n",
       "      <th>gender</th>\n",
       "      <th>jobStatus</th>\n",
       "      <th>exp</th>\n",
       "      <th>expectPosition</th>\n",
       "      <th>willSalaryStart</th>\n",
       "      <th>willSalaryEnd</th>\n",
       "      <th>city</th>\n",
       "      <th>publishTime</th>\n",
       "      <th>updateTime</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1574625077318778880</td>\n",
       "      <td>谢女士</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>无经验</td>\n",
       "      <td>[\"Hadoop大数据开发工程师\",\"数据分析师\"]</td>\n",
       "      <td>9000</td>\n",
       "      <td>11000</td>\n",
       "      <td>[\"广东省\",\"深圳市\",\"南山区\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2022-09-27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1573938917198135296</td>\n",
       "      <td>李女士</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>无经验</td>\n",
       "      <td>[\"Hadoop大数据开发工程师\"]</td>\n",
       "      <td>10000</td>\n",
       "      <td>15000</td>\n",
       "      <td>[\"广东省\",\"广州市\",\"番禺区\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2022-09-27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1569987734318219264</td>\n",
       "      <td>马先生</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1年工作经验</td>\n",
       "      <td>[\"数据分析师\",\"数据挖掘工程师\",\"图像处理工程师\"]</td>\n",
       "      <td>8000</td>\n",
       "      <td>12000</td>\n",
       "      <td>[\"广东省\",\"广州市\",\"天河区\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2022-09-14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1569514123790778368</td>\n",
       "      <td>杨先生</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>无经验</td>\n",
       "      <td>[\"数据分析师\"]</td>\n",
       "      <td>7000</td>\n",
       "      <td>10000</td>\n",
       "      <td>[\"湖北省\",\"武汉市\",\"洪山区\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2022-09-13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1562334736783900672</td>\n",
       "      <td>黄先生</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>无经验</td>\n",
       "      <td>[\"数据分析师\",\"数据挖掘工程师\"]</td>\n",
       "      <td>4000</td>\n",
       "      <td>8000</td>\n",
       "      <td>[\"广东省\",\"广州市\",\"天河区\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2022-08-24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10923</th>\n",
       "      <td>7539911466257411604</td>\n",
       "      <td>林女士</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"数据分析师\",\"数据挖掘工程师\"]</td>\n",
       "      <td>4000</td>\n",
       "      <td>6000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10924</th>\n",
       "      <td>7539911474847346196</td>\n",
       "      <td>易女士</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"数据分析师\",\"数据挖掘工程师\"]</td>\n",
       "      <td>4000</td>\n",
       "      <td>6000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10925</th>\n",
       "      <td>7539911483437280788</td>\n",
       "      <td>林女士</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"数据分析师\",\"数据挖掘工程师\"]</td>\n",
       "      <td>4000</td>\n",
       "      <td>6000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10926</th>\n",
       "      <td>7539911492027215380</td>\n",
       "      <td>李女士</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"数据分析师\",\"数据挖掘工程师\"]</td>\n",
       "      <td>4000</td>\n",
       "      <td>6000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10927</th>\n",
       "      <td>7539911500617149972</td>\n",
       "      <td>范先生</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"数据分析师\",\"数据挖掘工程师\"]</td>\n",
       "      <td>4000</td>\n",
       "      <td>6000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10928 rows × 11 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                  resumeId username  gender  jobStatus     exp  \\\n",
       "0      1574625077318778880      谢女士     1.0          1     无经验   \n",
       "1      1573938917198135296      李女士     1.0          1     无经验   \n",
       "2      1569987734318219264      马先生     0.0          1  1年工作经验   \n",
       "3      1569514123790778368      杨先生     0.0          0     无经验   \n",
       "4      1562334736783900672      黄先生     0.0          1     无经验   \n",
       "...                    ...      ...     ...        ...     ...   \n",
       "10923  7539911466257411604      林女士     1.0          1     NaN   \n",
       "10924  7539911474847346196      易女士     1.0          1     NaN   \n",
       "10925  7539911483437280788      林女士     1.0          1     NaN   \n",
       "10926  7539911492027215380      李女士     1.0          1     NaN   \n",
       "10927  7539911500617149972      范先生     0.0          1     NaN   \n",
       "\n",
       "                      expectPosition  willSalaryStart  willSalaryEnd  \\\n",
       "0         [\"Hadoop大数据开发工程师\",\"数据分析师\"]             9000          11000   \n",
       "1                 [\"Hadoop大数据开发工程师\"]            10000          15000   \n",
       "2      [\"数据分析师\",\"数据挖掘工程师\",\"图像处理工程师\"]             8000          12000   \n",
       "3                          [\"数据分析师\"]             7000          10000   \n",
       "4                [\"数据分析师\",\"数据挖掘工程师\"]             4000           8000   \n",
       "...                              ...              ...            ...   \n",
       "10923            [\"数据分析师\",\"数据挖掘工程师\"]             4000           6000   \n",
       "10924            [\"数据分析师\",\"数据挖掘工程师\"]             4000           6000   \n",
       "10925            [\"数据分析师\",\"数据挖掘工程师\"]             4000           6000   \n",
       "10926            [\"数据分析师\",\"数据挖掘工程师\"]             4000           6000   \n",
       "10927            [\"数据分析师\",\"数据挖掘工程师\"]             4000           6000   \n",
       "\n",
       "                      city publishTime  updateTime  \n",
       "0      [\"广东省\",\"深圳市\",\"南山区\"]         NaN  2022-09-27  \n",
       "1      [\"广东省\",\"广州市\",\"番禺区\"]         NaN  2022-09-27  \n",
       "2      [\"广东省\",\"广州市\",\"天河区\"]         NaN  2022-09-14  \n",
       "3      [\"湖北省\",\"武汉市\",\"洪山区\"]         NaN  2022-09-13  \n",
       "4      [\"广东省\",\"广州市\",\"天河区\"]         NaN  2022-08-24  \n",
       "...                    ...         ...         ...  \n",
       "10923                  NaN         NaN         NaN  \n",
       "10924                  NaN         NaN         NaN  \n",
       "10925                  NaN         NaN         NaN  \n",
       "10926                  NaN         NaN         NaN  \n",
       "10927                  NaN         NaN         NaN  \n",
       "\n",
       "[10928 rows x 11 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "People = pd.read_csv(\"../Data/OriginData/FindPeople.csv\")\n",
    "PeopleDetail = pd.read_csv(\"../Data/OriginData/FindPeopleDetail.csv\")\n",
    "People"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]                                                                                                                                                                                                                                                                                                10781\n",
       "['广州泰迪智能科技有限公司|大数据开发相关课程|2021-09-01|2021-12-31|培训内容：Java程序设计，Linux操作系统，SQL基础，Hadoop大数据基础，Hive数据仓库，ZooKeeper分布式服务框架，HBase非关系型数据库，Scala基础，Spark大数据技术与应用，Flume数据采集，Kafka大数据数据流处理，Flink大数据实时处理，分布式文件搜索ElasticSearch。']                                                                                    1\n",
       "['广东泰迪智能科技股份有限公司|Python数据分析|2021-08-30|2021-12-29|在此次培训经历中,学习的主要课程有 Python编程基础、 Python数据分析与应用、 Python数据可视化、 Python网络爬虫、 Python机器学习、综合项目实战、深度学习原理及编程实现、计算机视觉及项目实战、自然语言处理及项目实战、 MYSQL数据库、MYSQL数据库项目实战、 Linux基础、 Hadoop、hive、 spark。重点学习Python基础与数据分析、应用']                                               1\n",
       "['广东泰迪智能科技有限公司|大数据开发|2021-09-01|2021-12-30|培训内容：学习Java 基础，Linux 操作系统，SQL 基础，Hadoop，Hive，Zookeeper，Hbase，Scala 基础，Spark，\\nFlume，Kafka，Flink，ES等大数据开发技术。\\n']                                                                                                                                            1\n",
       "['泰迪智能科技|大数据分析|2021-08-31|2021-12-31|在泰迪智能科技公司进行为期四个月的大数据分析培训及实训，获益良多。']                                                                                                                                                                                                                              1\n",
       "                                                                                                                                                                                                                                                                                                  ...  \n",
       "['泰迪智能科技有限公司|大数据开发|2021-08-31|2021-12-31|在泰迪我学习到了跟大量跟大数据相关的新知识']                                                                                                                                                                                                                                      1\n",
       "['泰迪智能科技公司|大数据分析|2021-08-31|2021-12-30|参加广州泰迪智能科技有限公司组织举办的为期4个月的“大数据分析及工程实践”培训。通过此次培训，我对数据分析技术有了更深刻的理解和认识，对数据分析工作内容有了更深的体会。培训内容如下：\\n（1）大数据分析理论：数据探索、数据预处理、爬虫工程、机器学习（KNN近邻算法、决策树算法、K-mean聚类分析等算法）、深度学习、数据库等基本操作和技术。\\n（2）主要语言学习：Python，Mysql。\\n（3）全真企业案例实训：教育平台的线上课程智能推荐策略，餐饮企业综合数据分析，网站会员流失预测']        1\n",
       "['广东泰迪智能科技公司|数据分析|2021-09-01|2021-12-30|主要内容：python数据分析、python数据可视化、机器学习、TensorFlow深度学习、MySQL数据库等。']                                                                                                                                                                                                  1\n",
       "['广东泰迪智能科技公司|数据分析|2021-08-31|2021-12-30|主要内容：python数据分析、python数据可视化、机器学习、TensorFlow深度学习、MySQL数据库等。']                                                                                                                                                                                                  1\n",
       "['泰迪智能科技股份有限公司|python数据分析|2021-11-08|2021-12-30|1. 基于 requests 库、Xpath、Selenium 库等的网络爬虫；\\n2. 基于决策树、关联规则、聚类模型等的机器学习；\\n3. 基于 tensorflow 框架的 CNN、RNN、LSTM、GAN 等神经网络模型。']                                                                                                                                 1\n",
       "Name: trainingExperienceList, Length: 93, dtype: int64"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "PeopleDetail['trainingExperienceList'].value_counts()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 总体查看"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 存在重复数据，进行删除"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop_duplicates() with no subset only removes rows that repeat an earlier row\n",
    "# in every column, so keeping the first occurrence discards no information.\n",
    "People = People.drop_duplicates()\n",
    "PeopleDetail = PeopleDetail.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "41"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "missing_data = set(People.resumeId.values).difference(set(PeopleDetail.resumeId.values))\n",
    "len(missing_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>resumeId</th>\n",
       "      <th>username</th>\n",
       "      <th>gender</th>\n",
       "      <th>jobStatus</th>\n",
       "      <th>exp</th>\n",
       "      <th>expectPosition</th>\n",
       "      <th>willSalaryStart</th>\n",
       "      <th>willSalaryEnd</th>\n",
       "      <th>city</th>\n",
       "      <th>publishTime</th>\n",
       "      <th>updateTime</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1574625077318778880</td>\n",
       "      <td>谢女士</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>无经验</td>\n",
       "      <td>[\"Hadoop大数据开发工程师\",\"数据分析师\"]</td>\n",
       "      <td>9000</td>\n",
       "      <td>11000</td>\n",
       "      <td>[\"广东省\",\"深圳市\",\"南山区\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2022-09-27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1573938917198135296</td>\n",
       "      <td>李女士</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>无经验</td>\n",
       "      <td>[\"Hadoop大数据开发工程师\"]</td>\n",
       "      <td>10000</td>\n",
       "      <td>15000</td>\n",
       "      <td>[\"广东省\",\"广州市\",\"番禺区\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2022-09-27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1569987734318219264</td>\n",
       "      <td>马先生</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1年工作经验</td>\n",
       "      <td>[\"数据分析师\",\"数据挖掘工程师\",\"图像处理工程师\"]</td>\n",
       "      <td>8000</td>\n",
       "      <td>12000</td>\n",
       "      <td>[\"广东省\",\"广州市\",\"天河区\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2022-09-14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1569514123790778368</td>\n",
       "      <td>杨先生</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>无经验</td>\n",
       "      <td>[\"数据分析师\"]</td>\n",
       "      <td>7000</td>\n",
       "      <td>10000</td>\n",
       "      <td>[\"湖北省\",\"武汉市\",\"洪山区\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2022-09-13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1562334736783900672</td>\n",
       "      <td>黄先生</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>无经验</td>\n",
       "      <td>[\"数据分析师\",\"数据挖掘工程师\"]</td>\n",
       "      <td>4000</td>\n",
       "      <td>8000</td>\n",
       "      <td>[\"广东省\",\"广州市\",\"天河区\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2022-08-24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8276</th>\n",
       "      <td>7539911466257411604</td>\n",
       "      <td>林女士</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"数据分析师\",\"数据挖掘工程师\"]</td>\n",
       "      <td>4000</td>\n",
       "      <td>6000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8277</th>\n",
       "      <td>7539911474847346196</td>\n",
       "      <td>易女士</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"数据分析师\",\"数据挖掘工程师\"]</td>\n",
       "      <td>4000</td>\n",
       "      <td>6000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8278</th>\n",
       "      <td>7539911483437280788</td>\n",
       "      <td>林女士</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"数据分析师\",\"数据挖掘工程师\"]</td>\n",
       "      <td>4000</td>\n",
       "      <td>6000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8279</th>\n",
       "      <td>7539911492027215380</td>\n",
       "      <td>李女士</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"数据分析师\",\"数据挖掘工程师\"]</td>\n",
       "      <td>4000</td>\n",
       "      <td>6000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8280</th>\n",
       "      <td>7539911500617149972</td>\n",
       "      <td>范先生</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"数据分析师\",\"数据挖掘工程师\"]</td>\n",
       "      <td>4000</td>\n",
       "      <td>6000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8281 rows × 11 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                 resumeId username  gender  jobStatus     exp  \\\n",
       "0     1574625077318778880      谢女士     1.0          1     无经验   \n",
       "1     1573938917198135296      李女士     1.0          1     无经验   \n",
       "2     1569987734318219264      马先生     0.0          1  1年工作经验   \n",
       "3     1569514123790778368      杨先生     0.0          0     无经验   \n",
       "4     1562334736783900672      黄先生     0.0          1     无经验   \n",
       "...                   ...      ...     ...        ...     ...   \n",
       "8276  7539911466257411604      林女士     1.0          1     NaN   \n",
       "8277  7539911474847346196      易女士     1.0          1     NaN   \n",
       "8278  7539911483437280788      林女士     1.0          1     NaN   \n",
       "8279  7539911492027215380      李女士     1.0          1     NaN   \n",
       "8280  7539911500617149972      范先生     0.0          1     NaN   \n",
       "\n",
       "                     expectPosition  willSalaryStart  willSalaryEnd  \\\n",
       "0        [\"Hadoop大数据开发工程师\",\"数据分析师\"]             9000          11000   \n",
       "1                [\"Hadoop大数据开发工程师\"]            10000          15000   \n",
       "2     [\"数据分析师\",\"数据挖掘工程师\",\"图像处理工程师\"]             8000          12000   \n",
       "3                         [\"数据分析师\"]             7000          10000   \n",
       "4               [\"数据分析师\",\"数据挖掘工程师\"]             4000           8000   \n",
       "...                             ...              ...            ...   \n",
       "8276            [\"数据分析师\",\"数据挖掘工程师\"]             4000           6000   \n",
       "8277            [\"数据分析师\",\"数据挖掘工程师\"]             4000           6000   \n",
       "8278            [\"数据分析师\",\"数据挖掘工程师\"]             4000           6000   \n",
       "8279            [\"数据分析师\",\"数据挖掘工程师\"]             4000           6000   \n",
       "8280            [\"数据分析师\",\"数据挖掘工程师\"]             4000           6000   \n",
       "\n",
       "                     city publishTime  updateTime  \n",
       "0     [\"广东省\",\"深圳市\",\"南山区\"]         NaN  2022-09-27  \n",
       "1     [\"广东省\",\"广州市\",\"番禺区\"]         NaN  2022-09-27  \n",
       "2     [\"广东省\",\"广州市\",\"天河区\"]         NaN  2022-09-14  \n",
       "3     [\"湖北省\",\"武汉市\",\"洪山区\"]         NaN  2022-09-13  \n",
       "4     [\"广东省\",\"广州市\",\"天河区\"]         NaN  2022-08-24  \n",
       "...                   ...         ...         ...  \n",
       "8276                  NaN         NaN         NaN  \n",
       "8277                  NaN         NaN         NaN  \n",
       "8278                  NaN         NaN         NaN  \n",
       "8279                  NaN         NaN         NaN  \n",
       "8280                  NaN         NaN         NaN  \n",
       "\n",
       "[8281 rows x 11 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "People = People[People['resumeId'].map(lambda x :  x not in list(missing_data))].reset_index(drop=True)\n",
    "People"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(set(), set())"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "set(People['resumeId']).difference(PeopleDetail['resumeId']),set(PeopleDetail['resumeId']).difference(People['resumeId'])"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.性别处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1615\n"
     ]
    }
   ],
   "source": [
    "print(People['gender'].notnull().sum())         # 非缺失值数量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Turn the numeric gender code into a label: codes > 0 become '女', other codes '男'.\n",
    "# Rows without a gender code stay missing (NaN) instead of being silently\n",
    "# labelled '男' -- only a minority of rows carry a gender value at all.\n",
    "gender_code = People['gender']\n",
    "gender_label = gender_code.gt(0).map({True: '女', False: '男'})\n",
    "# where() keeps the label only where a code was present and puts NaN elsewhere\n",
    "People['gender'] = gender_label.where(gender_code.notna())"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.就业状态处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    8237\n",
       "0      41\n",
       "5       2\n",
       "2       1\n",
       "Name: jobStatus, dtype: int64"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "People['jobStatus'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Label -> raw status code. ReverseDic presumably inverts this into\n",
    "# code -> label for the map() below -- TODO confirm against Utils.utils.\n",
    "jobStatusDic = {\n",
    "    '毕业找工作': 1,\n",
    "    '无明确就业状态': 0,\n",
    "    '暂不换工作': 5,\n",
    "    '在职看机会': 2,\n",
    "}\n",
    "status_lookup = utils.ReverseDic(jobStatusDic)\n",
    "People['jobStatus'] = People['jobStatus'].map(status_lookup)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.工作经验"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "存在大量缺失值，用无经验进行填充。\n",
    "\n",
    "经验对找工作是一个很重要的因素。因此缺失值用无经验进行填充"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "无经验          178\n",
       "1年工作经验        18\n",
       "4年工作经验         6\n",
       "3年工作经验         6\n",
       "10年以上工作经验      4\n",
       "2年工作经验         2\n",
       "6年工作经验         1\n",
       "5年工作经验         1\n",
       "Name: exp, dtype: int64"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "People['exp'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Experience label -> number of years ('10年以上工作经验' is recorded as 10)\n",
    "expDic = {\n",
    "    '无经验': 0,\n",
    "    '2年工作经验': 2,\n",
    "    '5年工作经验': 5,\n",
    "    '6年工作经验': 6,\n",
    "    '1年工作经验': 1,\n",
    "    '4年工作经验': 4,\n",
    "    '10年以上工作经验': 10,\n",
    "    '3年工作经验': 3,\n",
    "}\n",
    "# Missing experience is treated as '无经验' before the labels are converted to years\n",
    "People['exp'] = People['exp'].fillna('无经验').map(expDic)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.期望薪资处理\n",
    "\n",
    "加一个平均薪资指标"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Midpoint of the expected salary range, computed on whole columns instead of\n",
    "# a row-by-row DataFrame.apply, which is slow and passes every row through an\n",
    "# object-dtype Series.\n",
    "People['averageSalary'] = (People['willSalaryStart'] + People['willSalaryEnd']) / 2"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5.期望城市处理\n",
    "\n",
    "有一个坏数据，处理掉。\n",
    "\n",
    "将city这一列拆分成省、市、区，并去除一些杂乱的符号"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[\"广东省\",\"广州市\",\"天河区\"]                                                                                                                            96\n",
       "[\"广东省\",\"广州市\",\"黄埔区\"]                                                                                                                            31\n",
       "[\"北京市\",\"北京市\",\"东城区\"]                                                                                                                            10\n",
       "[\"广东省\",\"深圳市\",\"宝安区\"]                                                                                                                             8\n",
       "[\"广东省\",\"深圳市\",\"南山区\"]                                                                                                                             6\n",
       "[\"广东省\",\"广州市\",\"白云区\"]                                                                                                                             6\n",
       "[\"广东省\",\"广州市\",\"番禺区\"]                                                                                                                             5\n",
       "[\"广东省\",\"广州市\",\"荔湾区\"]                                                                                                                             5\n",
       "[\"广东省\",\"广州市\",\"越秀区\"]                                                                                                                             4\n",
       "[\"广东省\",\"深圳市\",\"龙岗区\"]                                                                                                                             4\n",
       "[\"广东省\",\"广州市\",\"海珠区\"]                                                                                                                             4\n",
       "[\"天津市\",\"天津市\",\"河东区\"]                                                                                                                             3\n",
       "[\"天津市\",\"天津市\",\"和平区\"]                                                                                                                             2\n",
       "[\"广东省\",\"深圳市\",\"福田区\"]                                                                                                                             2\n",
       "[\"重庆市\",\"重庆市\",\"渝中区\"]                                                                                                                             2\n",
       "[\"广东省\",\"深圳市\",\"罗湖区\"]                                                                                                                             2\n",
       "[\"广东省\",\"佛山市\",\"南海区\"]                                                                                                                             2\n",
       "[\"北京市\",\"北京市\",\"海淀区\"]                                                                                                                             2\n",
       "[\"河北省\",\"秦皇岛市\",\"北戴河区\"]                                                                                                                           2\n",
       "[\"山西省\",\"长治市\",\"潞城区\"]                                                                                                                             2\n",
       "[\"四川省\",\"成都市\",\"武侯区\"]                                                                                                                             1\n",
       "[\"广东省\",\"东莞市\",\"东莞\"]                                                                                                                              1\n",
       "[\"陕西省\",\"西安市\",\"雁塔区\"]                                                                                                                             1\n",
       "[\"辽宁省\",\"沈阳市\",\"和平区\"]                                                                                                                             1\n",
       "[\"广东省\",\"佛山市\",\"顺德区\"]                                                                                                                             1\n",
       "\"\\\"\\\\\\\"\\\\\\\\\\\\\\\"[\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\"广东省\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\",\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\"广州市\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\",\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\"越秀区\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\"]\\\\\\\\\\\\\\\"\\\\\\\"\\\"\"     1\n",
       "[\"湖北省\",\"武汉市\",\"武昌区\"]                                                                                                                             1\n",
       "[\"浙江省\",\"杭州市\",\"滨江区\"]                                                                                                                             1\n",
       "[\"广西壮族自治区\",\"南宁市\",\"青秀区\"]                                                                                                                         1\n",
       "[\"广东省\",\"深圳市\",\"盐田区\"]                                                                                                                             1\n",
       "[\"上海市\",\"上海市\",\"黄浦区\"]                                                                                                                             1\n",
       "[\"山东省\",\"菏泽市\",\"曹县\"]                                                                                                                              1\n",
       "[\"天津市\",\"天津市\",\"河西区\"]                                                                                                                             1\n",
       "[\"浙江省\",\"杭州市\",\"拱墅区\"]                                                                                                                             1\n",
       "[\"湖北省\",\"武汉市\",\"洪山区\"]                                                                                                                             1\n",
       "[\"湖南省\",\"湘潭市\",\"湘潭县\"]                                                                                                                             1\n",
       "[\"江苏省\",\"南京市\",\"江宁区\"]                                                                                                                             1\n",
       "[\"广东省\",\"佛山市\",\"禅城区\"]                                                                                                                             1\n",
       "Name: city, dtype: int64"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "People['city'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "def f(x):\n",
    "    if isinstance(x,str):\n",
    "        if \"\\\\\" in x:\n",
    "            return str([\"广东省\",\"广州市\",\"越秀区\"])\n",
    "    return x\n",
    "\n",
    "People['city'] = People['city'].map(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "split_df = People['city'].str.split(',', expand=True)\n",
    "del People['city']\n",
    "split_df = split_df.rename(columns={0: \"province\",1:\"city\",2:\"region\"})\n",
    "People = pd.concat([People, split_df], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>resumeId</th>\n",
       "      <th>username</th>\n",
       "      <th>gender</th>\n",
       "      <th>jobStatus</th>\n",
       "      <th>exp</th>\n",
       "      <th>expectPosition</th>\n",
       "      <th>willSalaryStart</th>\n",
       "      <th>willSalaryEnd</th>\n",
       "      <th>publishTime</th>\n",
       "      <th>updateTime</th>\n",
       "      <th>averageSalary</th>\n",
       "      <th>province</th>\n",
       "      <th>city</th>\n",
       "      <th>region</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1574625077318778880</td>\n",
       "      <td>谢女士</td>\n",
       "      <td>女</td>\n",
       "      <td>毕业找工作</td>\n",
       "      <td>0</td>\n",
       "      <td>[\"Hadoop大数据开发工程师\",\"数据分析师\"]</td>\n",
       "      <td>9000</td>\n",
       "      <td>11000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2022-09-27</td>\n",
       "      <td>10000.0</td>\n",
       "      <td>[\"广东省\"</td>\n",
       "      <td>\"深圳市\"</td>\n",
       "      <td>\"南山区\"]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1573938917198135296</td>\n",
       "      <td>李女士</td>\n",
       "      <td>女</td>\n",
       "      <td>毕业找工作</td>\n",
       "      <td>0</td>\n",
       "      <td>[\"Hadoop大数据开发工程师\"]</td>\n",
       "      <td>10000</td>\n",
       "      <td>15000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2022-09-27</td>\n",
       "      <td>12500.0</td>\n",
       "      <td>[\"广东省\"</td>\n",
       "      <td>\"广州市\"</td>\n",
       "      <td>\"番禺区\"]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1569987734318219264</td>\n",
       "      <td>马先生</td>\n",
       "      <td>男</td>\n",
       "      <td>毕业找工作</td>\n",
       "      <td>1</td>\n",
       "      <td>[\"数据分析师\",\"数据挖掘工程师\",\"图像处理工程师\"]</td>\n",
       "      <td>8000</td>\n",
       "      <td>12000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2022-09-14</td>\n",
       "      <td>10000.0</td>\n",
       "      <td>[\"广东省\"</td>\n",
       "      <td>\"广州市\"</td>\n",
       "      <td>\"天河区\"]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1569514123790778368</td>\n",
       "      <td>杨先生</td>\n",
       "      <td>男</td>\n",
       "      <td>无明确就业状态</td>\n",
       "      <td>0</td>\n",
       "      <td>[\"数据分析师\"]</td>\n",
       "      <td>7000</td>\n",
       "      <td>10000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2022-09-13</td>\n",
       "      <td>8500.0</td>\n",
       "      <td>[\"湖北省\"</td>\n",
       "      <td>\"武汉市\"</td>\n",
       "      <td>\"洪山区\"]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1562334736783900672</td>\n",
       "      <td>黄先生</td>\n",
       "      <td>男</td>\n",
       "      <td>毕业找工作</td>\n",
       "      <td>0</td>\n",
       "      <td>[\"数据分析师\",\"数据挖掘工程师\"]</td>\n",
       "      <td>4000</td>\n",
       "      <td>8000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2022-08-24</td>\n",
       "      <td>6000.0</td>\n",
       "      <td>[\"广东省\"</td>\n",
       "      <td>\"广州市\"</td>\n",
       "      <td>\"天河区\"]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8276</th>\n",
       "      <td>7539911466257411604</td>\n",
       "      <td>林女士</td>\n",
       "      <td>女</td>\n",
       "      <td>毕业找工作</td>\n",
       "      <td>0</td>\n",
       "      <td>[\"数据分析师\",\"数据挖掘工程师\"]</td>\n",
       "      <td>4000</td>\n",
       "      <td>6000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8277</th>\n",
       "      <td>7539911474847346196</td>\n",
       "      <td>易女士</td>\n",
       "      <td>女</td>\n",
       "      <td>毕业找工作</td>\n",
       "      <td>0</td>\n",
       "      <td>[\"数据分析师\",\"数据挖掘工程师\"]</td>\n",
       "      <td>4000</td>\n",
       "      <td>6000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8278</th>\n",
       "      <td>7539911483437280788</td>\n",
       "      <td>林女士</td>\n",
       "      <td>女</td>\n",
       "      <td>毕业找工作</td>\n",
       "      <td>0</td>\n",
       "      <td>[\"数据分析师\",\"数据挖掘工程师\"]</td>\n",
       "      <td>4000</td>\n",
       "      <td>6000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8279</th>\n",
       "      <td>7539911492027215380</td>\n",
       "      <td>李女士</td>\n",
       "      <td>女</td>\n",
       "      <td>毕业找工作</td>\n",
       "      <td>0</td>\n",
       "      <td>[\"数据分析师\",\"数据挖掘工程师\"]</td>\n",
       "      <td>4000</td>\n",
       "      <td>6000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8280</th>\n",
       "      <td>7539911500617149972</td>\n",
       "      <td>范先生</td>\n",
       "      <td>男</td>\n",
       "      <td>毕业找工作</td>\n",
       "      <td>0</td>\n",
       "      <td>[\"数据分析师\",\"数据挖掘工程师\"]</td>\n",
       "      <td>4000</td>\n",
       "      <td>6000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8281 rows × 14 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                 resumeId username gender jobStatus  exp  \\\n",
       "0     1574625077318778880      谢女士      女     毕业找工作    0   \n",
       "1     1573938917198135296      李女士      女     毕业找工作    0   \n",
       "2     1569987734318219264      马先生      男     毕业找工作    1   \n",
       "3     1569514123790778368      杨先生      男   无明确就业状态    0   \n",
       "4     1562334736783900672      黄先生      男     毕业找工作    0   \n",
       "...                   ...      ...    ...       ...  ...   \n",
       "8276  7539911466257411604      林女士      女     毕业找工作    0   \n",
       "8277  7539911474847346196      易女士      女     毕业找工作    0   \n",
       "8278  7539911483437280788      林女士      女     毕业找工作    0   \n",
       "8279  7539911492027215380      李女士      女     毕业找工作    0   \n",
       "8280  7539911500617149972      范先生      男     毕业找工作    0   \n",
       "\n",
       "                     expectPosition  willSalaryStart  willSalaryEnd  \\\n",
       "0        [\"Hadoop大数据开发工程师\",\"数据分析师\"]             9000          11000   \n",
       "1                [\"Hadoop大数据开发工程师\"]            10000          15000   \n",
       "2     [\"数据分析师\",\"数据挖掘工程师\",\"图像处理工程师\"]             8000          12000   \n",
       "3                         [\"数据分析师\"]             7000          10000   \n",
       "4               [\"数据分析师\",\"数据挖掘工程师\"]             4000           8000   \n",
       "...                             ...              ...            ...   \n",
       "8276            [\"数据分析师\",\"数据挖掘工程师\"]             4000           6000   \n",
       "8277            [\"数据分析师\",\"数据挖掘工程师\"]             4000           6000   \n",
       "8278            [\"数据分析师\",\"数据挖掘工程师\"]             4000           6000   \n",
       "8279            [\"数据分析师\",\"数据挖掘工程师\"]             4000           6000   \n",
       "8280            [\"数据分析师\",\"数据挖掘工程师\"]             4000           6000   \n",
       "\n",
       "     publishTime  updateTime  averageSalary province   city  region  \n",
       "0            NaN  2022-09-27        10000.0   [\"广东省\"  \"深圳市\"  \"南山区\"]  \n",
       "1            NaN  2022-09-27        12500.0   [\"广东省\"  \"广州市\"  \"番禺区\"]  \n",
       "2            NaN  2022-09-14        10000.0   [\"广东省\"  \"广州市\"  \"天河区\"]  \n",
       "3            NaN  2022-09-13         8500.0   [\"湖北省\"  \"武汉市\"  \"洪山区\"]  \n",
       "4            NaN  2022-08-24         6000.0   [\"广东省\"  \"广州市\"  \"天河区\"]  \n",
       "...          ...         ...            ...      ...    ...     ...  \n",
       "8276         NaN         NaN         5000.0      NaN    NaN     NaN  \n",
       "8277         NaN         NaN         5000.0      NaN    NaN     NaN  \n",
       "8278         NaN         NaN         5000.0      NaN    NaN     NaN  \n",
       "8279         NaN         NaN         5000.0      NaN    NaN     NaN  \n",
       "8280         NaN         NaN         5000.0      NaN    NaN     NaN  \n",
       "\n",
       "[8281 rows x 14 columns]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "People"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "def f1(x):\n",
    "    if isinstance(x,str):\n",
    "        x = x.replace(\"\\\"\",\"\")\n",
    "        x = x.replace(\"[\",\"\")\n",
    "        x = x.replace(\"]\",\"\")\n",
    "    return x\n",
    "People['province'] = People['province'].map(f1)\n",
    "People['city'] = People['city'].map(f1)\n",
    "People['region'] = People['region'].map(f1)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 6.publishTime处理"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "publish_time只有20多条有数据，且为发布时间，用处不大,删掉"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2021-12-06    10\n",
       "2021-12-13     8\n",
       "2021-12-10     6\n",
       "2021-12-15     4\n",
       "2021-12-20     4\n",
       "2021-12-12     4\n",
       "2021-12-22     3\n",
       "2021-12-01     2\n",
       "2021-12-17     2\n",
       "2021-12-07     2\n",
       "2021-12-02     2\n",
       "2021-12-21     1\n",
       "2021-12-14     1\n",
       "2021-12-19     1\n",
       "2021-12-08     1\n",
       "2021-12-24     1\n",
       "2021-11-28     1\n",
       "2021-11-22     1\n",
       "Name: publishTime, dtype: int64"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "People['publishTime'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 去掉publishTime\n",
    "del People['publishTime']"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 7.expectPosition 处理"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "去除杂乱的符号，并用|来分割每一个职业"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0          [\"Hadoop大数据开发工程师\",\"数据分析师\"]\n",
       "1                  [\"Hadoop大数据开发工程师\"]\n",
       "2       [\"数据分析师\",\"数据挖掘工程师\",\"图像处理工程师\"]\n",
       "3                           [\"数据分析师\"]\n",
       "4                 [\"数据分析师\",\"数据挖掘工程师\"]\n",
       "                    ...              \n",
       "8276              [\"数据分析师\",\"数据挖掘工程师\"]\n",
       "8277              [\"数据分析师\",\"数据挖掘工程师\"]\n",
       "8278              [\"数据分析师\",\"数据挖掘工程师\"]\n",
       "8279              [\"数据分析师\",\"数据挖掘工程师\"]\n",
       "8280              [\"数据分析师\",\"数据挖掘工程师\"]\n",
       "Name: expectPosition, Length: 8281, dtype: object"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "People['expectPosition']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "def f(x):\n",
    "    x = x.replace(\"[\",\"\")\n",
    "    x = x.replace(\"]\",\"\")\n",
    "    x = x.replace(\"\\\"\",\"\")\n",
    "    x = x.split(\",\")\n",
    "    ans = \"\"\n",
    "    if len(x)>1:\n",
    "        for i in range(len(x)):\n",
    "            if i!= len(x)-1:\n",
    "                ans+=x[i]+\"|\"\n",
    "            else:\n",
    "                ans += x[i]\n",
    "    else:\n",
    "        ans += x[0]\n",
    "    return ans\n",
    "People['expectPosition'] = People['expectPosition'].map(f)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# People['expectPosition'].value_counts().to_csv(\"temp.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "#去除两条脏数据\n",
    "def f(x):\n",
    "    if \"数据分析师\" in x and \"\\\\\" in x and \"数据挖掘工程师\" in x:\n",
    "        return \"数据分析师|数据挖掘工程师\"\n",
    "    elif \"数据分析师\" in x and \"\\\\\" in x and \"数据挖掘工程师\" not in x:\n",
    "        return \"数据分析师\"\n",
    "    else:\n",
    "        return x\n",
    "        \n",
    "People['expectPosition'] = People['expectPosition'].map(f)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## PersonDetail处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>resumeId</th>\n",
       "      <th>resumeName</th>\n",
       "      <th>birthday</th>\n",
       "      <th>address</th>\n",
       "      <th>arrivalTime</th>\n",
       "      <th>politicalStatus</th>\n",
       "      <th>selfEvaluation</th>\n",
       "      <th>expectIndustry</th>\n",
       "      <th>willNature</th>\n",
       "      <th>keywordList</th>\n",
       "      <th>educationExperienceList</th>\n",
       "      <th>projectExperienceList</th>\n",
       "      <th>competitionExperienceList</th>\n",
       "      <th>trainingExperienceList</th>\n",
       "      <th>skillList</th>\n",
       "      <th>languageList</th>\n",
       "      <th>certList</th>\n",
       "      <th>workExperienceList</th>\n",
       "      <th>attachmentList</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1573938917198135296</td>\n",
       "      <td>湖南科技大学-大数据-李敏佳</td>\n",
       "      <td>2001-01-14 00:00:00.0</td>\n",
       "      <td>[\"湖南省\",\"湘潭市\",\"雨湖区\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>LEAGUE_MEMBER</td>\n",
       "      <td>掌握Java、Python的基础语法，以及常用的数据结构与算法\\n掌握Hadoop、Hive...</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>2.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>['湖南科技大学|数据科学与大数据技术|本科|2019-09-01|2023-06-17']</td>\n",
       "      <td>['工资管理系统|None|核心成员|2020-11-28|2020-12-10|使用Jav...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['英语|GOOD']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['湖南科技大学-数据开发-李敏佳|None|/userywfmzg/16642547805...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1541793867911790592</td>\n",
       "      <td>前端开发-殷浩</td>\n",
       "      <td>1999-03-07 00:00:00.0</td>\n",
       "      <td>[\"湖南省\",\"长沙市\",\"开福区\"]</td>\n",
       "      <td>2周后到岗</td>\n",
       "      <td>LEAGUE_MEMBER</td>\n",
       "      <td>具有较强的逻辑分析和再学习能力，对前端领域有所认知，积极进取，勇于挑战，为人热情，工作勤奋刻...</td>\n",
       "      <td>[\"不限\"]</td>\n",
       "      <td>2.0</td>\n",
       "      <td>['前端开发']</td>\n",
       "      <td>['湖南信息学院|网络工程|本科|2017-09-01|2022-06-28']</td>\n",
       "      <td>['校园综合平台小程序|湖南信息学院|前端开发|2021-03-03|2021-05-05|...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['英语|COMMONLY']</td>\n",
       "      <td>[]</td>\n",
       "      <td>['None|湖南游掌竟网络科技有限公司|[\"游戏\"]|前端开发实习|2021-07-01|...</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1569514123790778368</td>\n",
       "      <td>简历</td>\n",
       "      <td>2001-04-11 00:00:00.0</td>\n",
       "      <td>[\"湖北省\",\"武汉市\",\"洪山区\"]</td>\n",
       "      <td>时间待议</td>\n",
       "      <td>LEAGUE_MEMBER</td>\n",
       "      <td>1.本人性格稳重随和、谈吐幽默风趣、具有优秀的社交能力和的组织协调能力。\\n2.适应能力强、...</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['其他|“育知联杯”智慧商业大数据创新应用大赛|一等奖|None', '其他|湖北省信创大...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['工信部证书|达梦认证证书|2022-05-26|', '泰迪科技实习证明|泰迪科技实习证...</td>\n",
       "      <td>['None|泰迪智能科技股份有限公司|[\"数据服务\",\"互联网\"]|数据分析师|2022-...</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1550096565136392192</td>\n",
       "      <td>蓝梓坚</td>\n",
       "      <td>2001-01-04 00:00:00.0</td>\n",
       "      <td>[\"广东省\",\"梅州市\",\"兴宁市\"]</td>\n",
       "      <td>时间待议</td>\n",
       "      <td>LEAGUE_MEMBER</td>\n",
       "      <td>本人有较强的组织协调能力、活动策划能力和公关能力;具有良好的团队精神，善于与人沟通和协作;社...</td>\n",
       "      <td>[\"不限\"]</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>['广州大学|数学与应用数学精算专业|本科|2019-09-01|2023-07-23']</td>\n",
       "      <td>[]</td>\n",
       "      <td>['其他|“互联网+”大学生创业大赛|校级铜奖|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>['计算机二级|SKILLED']</td>\n",
       "      <td>['英语|GOOD']</td>\n",
       "      <td>[]</td>\n",
       "      <td>['None|新东方教育科技有限公司|[\"在线教育\"]|学管师|2021-07-01|202...</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1554782002648055808</td>\n",
       "      <td>个人简历</td>\n",
       "      <td>1999-08-05 00:00:00.0</td>\n",
       "      <td>[\"广东省\",\"广州市\",\"白云区\"]</td>\n",
       "      <td>1周后到岗</td>\n",
       "      <td>PARTY_MEMBER</td>\n",
       "      <td>1、性格开朗外向，珍惜每一次学习机会，有较好的自学能力以及自我管理能力。\\n2、专业水平扎实...</td>\n",
       "      <td>[\"互联网\",\"金融\",\"在线教育\"]</td>\n",
       "      <td>2.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>['韩山师范学院|统计学|本科|2018-09-15|2022-06-27']</td>\n",
       "      <td>['航空公司客户价值分析|None|数据分析师|2021-11-18|2021-12-01|...</td>\n",
       "      <td>['泰迪杯|第八届泰迪杯数据挖掘挑战赛|三等奖|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>['python|GOOD']</td>\n",
       "      <td>['英语四级|GOOD']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10868</th>\n",
       "      <td>7539911466257411604</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1993-12-04 00:00:00.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['CBDA技能证书|大数据分析工程师-基础级|2018-04-13|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10869</th>\n",
       "      <td>7539911474847346196</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1993-08-22 00:00:00.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10870</th>\n",
       "      <td>7539911500617149972</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1995-08-08 00:00:00.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['CBDA技能证书|大数据分析工程师-基础级|2018-04-13|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10871</th>\n",
       "      <td>7539911483437280788</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1994-11-24 00:00:00.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10872</th>\n",
       "      <td>7539911492027215380</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1993-03-04 00:00:00.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8281 rows × 19 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                  resumeId      resumeName               birthday  \\\n",
       "0      1573938917198135296  湖南科技大学-大数据-李敏佳  2001-01-14 00:00:00.0   \n",
       "1      1541793867911790592         前端开发-殷浩  1999-03-07 00:00:00.0   \n",
       "2      1569514123790778368              简历  2001-04-11 00:00:00.0   \n",
       "3      1550096565136392192             蓝梓坚  2001-01-04 00:00:00.0   \n",
       "4      1554782002648055808            个人简历  1999-08-05 00:00:00.0   \n",
       "...                    ...             ...                    ...   \n",
       "10868  7539911466257411604             NaN  1993-12-04 00:00:00.0   \n",
       "10869  7539911474847346196             NaN  1993-08-22 00:00:00.0   \n",
       "10870  7539911500617149972             NaN  1995-08-08 00:00:00.0   \n",
       "10871  7539911483437280788             NaN  1994-11-24 00:00:00.0   \n",
       "10872  7539911492027215380             NaN  1993-03-04 00:00:00.0   \n",
       "\n",
       "                   address arrivalTime politicalStatus  \\\n",
       "0      [\"湖南省\",\"湘潭市\",\"雨湖区\"]         NaN   LEAGUE_MEMBER   \n",
       "1      [\"湖南省\",\"长沙市\",\"开福区\"]       2周后到岗   LEAGUE_MEMBER   \n",
       "2      [\"湖北省\",\"武汉市\",\"洪山区\"]        时间待议   LEAGUE_MEMBER   \n",
       "3      [\"广东省\",\"梅州市\",\"兴宁市\"]        时间待议   LEAGUE_MEMBER   \n",
       "4      [\"广东省\",\"广州市\",\"白云区\"]       1周后到岗    PARTY_MEMBER   \n",
       "...                    ...         ...             ...   \n",
       "10868                  NaN         NaN             NaN   \n",
       "10869                  NaN         NaN             NaN   \n",
       "10870                  NaN         NaN             NaN   \n",
       "10871                  NaN         NaN             NaN   \n",
       "10872                  NaN         NaN             NaN   \n",
       "\n",
       "                                          selfEvaluation       expectIndustry  \\\n",
       "0      掌握Java、Python的基础语法，以及常用的数据结构与算法\\n掌握Hadoop、Hive...              [\"互联网\"]   \n",
       "1      具有较强的逻辑分析和再学习能力，对前端领域有所认知，积极进取，勇于挑战，为人热情，工作勤奋刻...               [\"不限\"]   \n",
       "2      1.本人性格稳重随和、谈吐幽默风趣、具有优秀的社交能力和的组织协调能力。\\n2.适应能力强、...              [\"互联网\"]   \n",
       "3      本人有较强的组织协调能力、活动策划能力和公关能力;具有良好的团队精神，善于与人沟通和协作;社...               [\"不限\"]   \n",
       "4      1、性格开朗外向，珍惜每一次学习机会，有较好的自学能力以及自我管理能力。\\n2、专业水平扎实...  [\"互联网\",\"金融\",\"在线教育\"]   \n",
       "...                                                  ...                  ...   \n",
       "10868                                                NaN              [\"互联网\"]   \n",
       "10869                                                NaN              [\"互联网\"]   \n",
       "10870                                                NaN              [\"互联网\"]   \n",
       "10871                                                NaN              [\"互联网\"]   \n",
       "10872                                                NaN              [\"互联网\"]   \n",
       "\n",
       "       willNature keywordList                         educationExperienceList  \\\n",
       "0             2.0          []  ['湖南科技大学|数据科学与大数据技术|本科|2019-09-01|2023-06-17']   \n",
       "1             2.0    ['前端开发']        ['湖南信息学院|网络工程|本科|2017-09-01|2022-06-28']   \n",
       "2             0.0          []                                              []   \n",
       "3             0.0          []   ['广州大学|数学与应用数学精算专业|本科|2019-09-01|2023-07-23']   \n",
       "4             2.0          []         ['韩山师范学院|统计学|本科|2018-09-15|2022-06-27']   \n",
       "...           ...         ...                                             ...   \n",
       "10868         NaN          []                                              []   \n",
       "10869         NaN          []                                              []   \n",
       "10870         NaN          []                                              []   \n",
       "10871         NaN          []                                              []   \n",
       "10872         NaN          []                                              []   \n",
       "\n",
       "                                   projectExperienceList  \\\n",
       "0      ['工资管理系统|None|核心成员|2020-11-28|2020-12-10|使用Jav...   \n",
       "1      ['校园综合平台小程序|湖南信息学院|前端开发|2021-03-03|2021-05-05|...   \n",
       "2                                                     []   \n",
       "3                                                     []   \n",
       "4      ['航空公司客户价值分析|None|数据分析师|2021-11-18|2021-12-01|...   \n",
       "...                                                  ...   \n",
       "10868                                                 []   \n",
       "10869                                                 []   \n",
       "10870                                                 []   \n",
       "10871                                                 []   \n",
       "10872                                                 []   \n",
       "\n",
       "                               competitionExperienceList  \\\n",
       "0                                                     []   \n",
       "1                                                     []   \n",
       "2      ['其他|“育知联杯”智慧商业大数据创新应用大赛|一等奖|None', '其他|湖北省信创大...   \n",
       "3                         ['其他|“互联网+”大学生创业大赛|校级铜奖|None']   \n",
       "4                         ['泰迪杯|第八届泰迪杯数据挖掘挑战赛|三等奖|None']   \n",
       "...                                                  ...   \n",
       "10868                                                 []   \n",
       "10869                                                 []   \n",
       "10870                                                 []   \n",
       "10871                                                 []   \n",
       "10872                                                 []   \n",
       "\n",
       "      trainingExperienceList          skillList     languageList  \\\n",
       "0                         []                 []      ['英语|GOOD']   \n",
       "1                         []                 []  ['英语|COMMONLY']   \n",
       "2                         []                 []               []   \n",
       "3                         []  ['计算机二级|SKILLED']      ['英语|GOOD']   \n",
       "4                         []    ['python|GOOD']    ['英语四级|GOOD']   \n",
       "...                      ...                ...              ...   \n",
       "10868                     []                 []               []   \n",
       "10869                     []                 []               []   \n",
       "10870                     []                 []               []   \n",
       "10871                     []                 []               []   \n",
       "10872                     []                 []               []   \n",
       "\n",
       "                                                certList  \\\n",
       "0                                                     []   \n",
       "1                                                     []   \n",
       "2      ['工信部证书|达梦认证证书|2022-05-26|', '泰迪科技实习证明|泰迪科技实习证...   \n",
       "3                                                     []   \n",
       "4                                                     []   \n",
       "...                                                  ...   \n",
       "10868          ['CBDA技能证书|大数据分析工程师-基础级|2018-04-13|None']   \n",
       "10869          ['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']   \n",
       "10870          ['CBDA技能证书|大数据分析工程师-基础级|2018-04-13|None']   \n",
       "10871          ['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']   \n",
       "10872          ['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']   \n",
       "\n",
       "                                      workExperienceList  \\\n",
       "0                                                     []   \n",
       "1      ['None|湖南游掌竟网络科技有限公司|[\"游戏\"]|前端开发实习|2021-07-01|...   \n",
       "2      ['None|泰迪智能科技股份有限公司|[\"数据服务\",\"互联网\"]|数据分析师|2022-...   \n",
       "3      ['None|新东方教育科技有限公司|[\"在线教育\"]|学管师|2021-07-01|202...   \n",
       "4                                                     []   \n",
       "...                                                  ...   \n",
       "10868                                                 []   \n",
       "10869                                                 []   \n",
       "10870                                                 []   \n",
       "10871                                                 []   \n",
       "10872                                                 []   \n",
       "\n",
       "                                          attachmentList  \n",
       "0      ['湖南科技大学-数据开发-李敏佳|None|/userywfmzg/16642547805...  \n",
       "1                                                     []  \n",
       "2                                                     []  \n",
       "3                                                     []  \n",
       "4                                                     []  \n",
       "...                                                  ...  \n",
       "10868                                                 []  \n",
       "10869                                                 []  \n",
       "10870                                                 []  \n",
       "10871                                                 []  \n",
       "10872                                                 []  \n",
       "\n",
       "[8281 rows x 19 columns]"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "PeopleDetail"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.标签删除\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "简历名称鱼龙混杂，数据少，且格式什么样的都有，删掉，缺失率达98%多"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1         11\n",
       "简历         9\n",
       "数据分析       8\n",
       "个人简历       7\n",
       "数据分析师      7\n",
       "          ..\n",
       "唐颖         1\n",
       "田昕钰        1\n",
       "李格桑        1\n",
       "陈择军+韩师     1\n",
       "商业数据分析     1\n",
       "Name: resumeName, Length: 156, dtype: int64"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "PeopleDetail[\"resumeName\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "del PeopleDetail[\"resumeName\"]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.将出生日期转换为年龄"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2000-07-09 00:00:00.0    5\n",
       "2000-06-15 00:00:00.0    5\n",
       "1996-12-07 00:00:00.0    4\n",
       "1999-11-10 00:00:00.0    4\n",
       "1998-11-14 00:00:00.0    4\n",
       "                        ..\n",
       "2000-10-27 00:00:00.0    1\n",
       "1995-11-13 00:00:00.0    1\n",
       "1998-07-04 00:00:00.0    1\n",
       "1998-09-23 00:00:00.0    1\n",
       "1993-03-04 00:00:00.0    1\n",
       "Name: birthday, Length: 1150, dtype: int64"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "PeopleDetail['birthday'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "def f(x):\n",
    "    import re\n",
    "    if x:\n",
    "        if \"T\" in str(x):\n",
    "            new_date_string = re.sub(r'T.*$', '', x)\n",
    "            return new_date_string\n",
    "        else:\n",
    "            x = str(x).split(\" \")\n",
    "            return x[0]\n",
    "    return x\n",
    "PeopleDetail['birthday'] = PeopleDetail['birthday'].map(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>resumeId</th>\n",
       "      <th>birthday</th>\n",
       "      <th>address</th>\n",
       "      <th>arrivalTime</th>\n",
       "      <th>politicalStatus</th>\n",
       "      <th>selfEvaluation</th>\n",
       "      <th>expectIndustry</th>\n",
       "      <th>willNature</th>\n",
       "      <th>keywordList</th>\n",
       "      <th>educationExperienceList</th>\n",
       "      <th>projectExperienceList</th>\n",
       "      <th>competitionExperienceList</th>\n",
       "      <th>trainingExperienceList</th>\n",
       "      <th>skillList</th>\n",
       "      <th>languageList</th>\n",
       "      <th>certList</th>\n",
       "      <th>workExperienceList</th>\n",
       "      <th>attachmentList</th>\n",
       "      <th>age</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1573938917198135296</td>\n",
       "      <td>2001-01-14</td>\n",
       "      <td>[\"湖南省\",\"湘潭市\",\"雨湖区\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>LEAGUE_MEMBER</td>\n",
       "      <td>掌握Java、Python的基础语法，以及常用的数据结构与算法\\n掌握Hadoop、Hive...</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>2.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>['湖南科技大学|数据科学与大数据技术|本科|2019-09-01|2023-06-17']</td>\n",
       "      <td>['工资管理系统|None|核心成员|2020-11-28|2020-12-10|使用Jav...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['英语|GOOD']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['湖南科技大学-数据开发-李敏佳|None|/userywfmzg/16642547805...</td>\n",
       "      <td>22.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1541793867911790592</td>\n",
       "      <td>1999-03-07</td>\n",
       "      <td>[\"湖南省\",\"长沙市\",\"开福区\"]</td>\n",
       "      <td>2周后到岗</td>\n",
       "      <td>LEAGUE_MEMBER</td>\n",
       "      <td>具有较强的逻辑分析和再学习能力，对前端领域有所认知，积极进取，勇于挑战，为人热情，工作勤奋刻...</td>\n",
       "      <td>[\"不限\"]</td>\n",
       "      <td>2.0</td>\n",
       "      <td>['前端开发']</td>\n",
       "      <td>['湖南信息学院|网络工程|本科|2017-09-01|2022-06-28']</td>\n",
       "      <td>['校园综合平台小程序|湖南信息学院|前端开发|2021-03-03|2021-05-05|...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['英语|COMMONLY']</td>\n",
       "      <td>[]</td>\n",
       "      <td>['None|湖南游掌竟网络科技有限公司|[\"游戏\"]|前端开发实习|2021-07-01|...</td>\n",
       "      <td>[]</td>\n",
       "      <td>24.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1569514123790778368</td>\n",
       "      <td>2001-04-11</td>\n",
       "      <td>[\"湖北省\",\"武汉市\",\"洪山区\"]</td>\n",
       "      <td>时间待议</td>\n",
       "      <td>LEAGUE_MEMBER</td>\n",
       "      <td>1.本人性格稳重随和、谈吐幽默风趣、具有优秀的社交能力和的组织协调能力。\\n2.适应能力强、...</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['其他|“育知联杯”智慧商业大数据创新应用大赛|一等奖|None', '其他|湖北省信创大...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['工信部证书|达梦认证证书|2022-05-26|', '泰迪科技实习证明|泰迪科技实习证...</td>\n",
       "      <td>['None|泰迪智能科技股份有限公司|[\"数据服务\",\"互联网\"]|数据分析师|2022-...</td>\n",
       "      <td>[]</td>\n",
       "      <td>22.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1550096565136392192</td>\n",
       "      <td>2001-01-04</td>\n",
       "      <td>[\"广东省\",\"梅州市\",\"兴宁市\"]</td>\n",
       "      <td>时间待议</td>\n",
       "      <td>LEAGUE_MEMBER</td>\n",
       "      <td>本人有较强的组织协调能力、活动策划能力和公关能力;具有良好的团队精神，善于与人沟通和协作;社...</td>\n",
       "      <td>[\"不限\"]</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>['广州大学|数学与应用数学精算专业|本科|2019-09-01|2023-07-23']</td>\n",
       "      <td>[]</td>\n",
       "      <td>['其他|“互联网+”大学生创业大赛|校级铜奖|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>['计算机二级|SKILLED']</td>\n",
       "      <td>['英语|GOOD']</td>\n",
       "      <td>[]</td>\n",
       "      <td>['None|新东方教育科技有限公司|[\"在线教育\"]|学管师|2021-07-01|202...</td>\n",
       "      <td>[]</td>\n",
       "      <td>22.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1554782002648055808</td>\n",
       "      <td>1999-08-05</td>\n",
       "      <td>[\"广东省\",\"广州市\",\"白云区\"]</td>\n",
       "      <td>1周后到岗</td>\n",
       "      <td>PARTY_MEMBER</td>\n",
       "      <td>1、性格开朗外向，珍惜每一次学习机会，有较好的自学能力以及自我管理能力。\\n2、专业水平扎实...</td>\n",
       "      <td>[\"互联网\",\"金融\",\"在线教育\"]</td>\n",
       "      <td>2.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>['韩山师范学院|统计学|本科|2018-09-15|2022-06-27']</td>\n",
       "      <td>['航空公司客户价值分析|None|数据分析师|2021-11-18|2021-12-01|...</td>\n",
       "      <td>['泰迪杯|第八届泰迪杯数据挖掘挑战赛|三等奖|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>['python|GOOD']</td>\n",
       "      <td>['英语四级|GOOD']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>23.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10868</th>\n",
       "      <td>7539911466257411604</td>\n",
       "      <td>1993-12-04</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['CBDA技能证书|大数据分析工程师-基础级|2018-04-13|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>29.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10869</th>\n",
       "      <td>7539911474847346196</td>\n",
       "      <td>1993-08-22</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>29.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10870</th>\n",
       "      <td>7539911500617149972</td>\n",
       "      <td>1995-08-08</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['CBDA技能证书|大数据分析工程师-基础级|2018-04-13|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>27.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10871</th>\n",
       "      <td>7539911483437280788</td>\n",
       "      <td>1994-11-24</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>28.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10872</th>\n",
       "      <td>7539911492027215380</td>\n",
       "      <td>1993-03-04</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>30.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8281 rows × 19 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                  resumeId    birthday              address arrivalTime  \\\n",
       "0      1573938917198135296  2001-01-14  [\"湖南省\",\"湘潭市\",\"雨湖区\"]         NaN   \n",
       "1      1541793867911790592  1999-03-07  [\"湖南省\",\"长沙市\",\"开福区\"]       2周后到岗   \n",
       "2      1569514123790778368  2001-04-11  [\"湖北省\",\"武汉市\",\"洪山区\"]        时间待议   \n",
       "3      1550096565136392192  2001-01-04  [\"广东省\",\"梅州市\",\"兴宁市\"]        时间待议   \n",
       "4      1554782002648055808  1999-08-05  [\"广东省\",\"广州市\",\"白云区\"]       1周后到岗   \n",
       "...                    ...         ...                  ...         ...   \n",
       "10868  7539911466257411604  1993-12-04                  NaN         NaN   \n",
       "10869  7539911474847346196  1993-08-22                  NaN         NaN   \n",
       "10870  7539911500617149972  1995-08-08                  NaN         NaN   \n",
       "10871  7539911483437280788  1994-11-24                  NaN         NaN   \n",
       "10872  7539911492027215380  1993-03-04                  NaN         NaN   \n",
       "\n",
       "      politicalStatus                                     selfEvaluation  \\\n",
       "0       LEAGUE_MEMBER  掌握Java、Python的基础语法，以及常用的数据结构与算法\\n掌握Hadoop、Hive...   \n",
       "1       LEAGUE_MEMBER  具有较强的逻辑分析和再学习能力，对前端领域有所认知，积极进取，勇于挑战，为人热情，工作勤奋刻...   \n",
       "2       LEAGUE_MEMBER  1.本人性格稳重随和、谈吐幽默风趣、具有优秀的社交能力和的组织协调能力。\\n2.适应能力强、...   \n",
       "3       LEAGUE_MEMBER  本人有较强的组织协调能力、活动策划能力和公关能力;具有良好的团队精神，善于与人沟通和协作;社...   \n",
       "4        PARTY_MEMBER  1、性格开朗外向，珍惜每一次学习机会，有较好的自学能力以及自我管理能力。\\n2、专业水平扎实...   \n",
       "...               ...                                                ...   \n",
       "10868             NaN                                                NaN   \n",
       "10869             NaN                                                NaN   \n",
       "10870             NaN                                                NaN   \n",
       "10871             NaN                                                NaN   \n",
       "10872             NaN                                                NaN   \n",
       "\n",
       "            expectIndustry  willNature keywordList  \\\n",
       "0                  [\"互联网\"]         2.0          []   \n",
       "1                   [\"不限\"]         2.0    ['前端开发']   \n",
       "2                  [\"互联网\"]         0.0          []   \n",
       "3                   [\"不限\"]         0.0          []   \n",
       "4      [\"互联网\",\"金融\",\"在线教育\"]         2.0          []   \n",
       "...                    ...         ...         ...   \n",
       "10868              [\"互联网\"]         NaN          []   \n",
       "10869              [\"互联网\"]         NaN          []   \n",
       "10870              [\"互联网\"]         NaN          []   \n",
       "10871              [\"互联网\"]         NaN          []   \n",
       "10872              [\"互联网\"]         NaN          []   \n",
       "\n",
       "                              educationExperienceList  \\\n",
       "0      ['湖南科技大学|数据科学与大数据技术|本科|2019-09-01|2023-06-17']   \n",
       "1            ['湖南信息学院|网络工程|本科|2017-09-01|2022-06-28']   \n",
       "2                                                  []   \n",
       "3       ['广州大学|数学与应用数学精算专业|本科|2019-09-01|2023-07-23']   \n",
       "4             ['韩山师范学院|统计学|本科|2018-09-15|2022-06-27']   \n",
       "...                                               ...   \n",
       "10868                                              []   \n",
       "10869                                              []   \n",
       "10870                                              []   \n",
       "10871                                              []   \n",
       "10872                                              []   \n",
       "\n",
       "                                   projectExperienceList  \\\n",
       "0      ['工资管理系统|None|核心成员|2020-11-28|2020-12-10|使用Jav...   \n",
       "1      ['校园综合平台小程序|湖南信息学院|前端开发|2021-03-03|2021-05-05|...   \n",
       "2                                                     []   \n",
       "3                                                     []   \n",
       "4      ['航空公司客户价值分析|None|数据分析师|2021-11-18|2021-12-01|...   \n",
       "...                                                  ...   \n",
       "10868                                                 []   \n",
       "10869                                                 []   \n",
       "10870                                                 []   \n",
       "10871                                                 []   \n",
       "10872                                                 []   \n",
       "\n",
       "                               competitionExperienceList  \\\n",
       "0                                                     []   \n",
       "1                                                     []   \n",
       "2      ['其他|“育知联杯”智慧商业大数据创新应用大赛|一等奖|None', '其他|湖北省信创大...   \n",
       "3                         ['其他|“互联网+”大学生创业大赛|校级铜奖|None']   \n",
       "4                         ['泰迪杯|第八届泰迪杯数据挖掘挑战赛|三等奖|None']   \n",
       "...                                                  ...   \n",
       "10868                                                 []   \n",
       "10869                                                 []   \n",
       "10870                                                 []   \n",
       "10871                                                 []   \n",
       "10872                                                 []   \n",
       "\n",
       "      trainingExperienceList          skillList     languageList  \\\n",
       "0                         []                 []      ['英语|GOOD']   \n",
       "1                         []                 []  ['英语|COMMONLY']   \n",
       "2                         []                 []               []   \n",
       "3                         []  ['计算机二级|SKILLED']      ['英语|GOOD']   \n",
       "4                         []    ['python|GOOD']    ['英语四级|GOOD']   \n",
       "...                      ...                ...              ...   \n",
       "10868                     []                 []               []   \n",
       "10869                     []                 []               []   \n",
       "10870                     []                 []               []   \n",
       "10871                     []                 []               []   \n",
       "10872                     []                 []               []   \n",
       "\n",
       "                                                certList  \\\n",
       "0                                                     []   \n",
       "1                                                     []   \n",
       "2      ['工信部证书|达梦认证证书|2022-05-26|', '泰迪科技实习证明|泰迪科技实习证...   \n",
       "3                                                     []   \n",
       "4                                                     []   \n",
       "...                                                  ...   \n",
       "10868          ['CBDA技能证书|大数据分析工程师-基础级|2018-04-13|None']   \n",
       "10869          ['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']   \n",
       "10870          ['CBDA技能证书|大数据分析工程师-基础级|2018-04-13|None']   \n",
       "10871          ['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']   \n",
       "10872          ['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']   \n",
       "\n",
       "                                      workExperienceList  \\\n",
       "0                                                     []   \n",
       "1      ['None|湖南游掌竟网络科技有限公司|[\"游戏\"]|前端开发实习|2021-07-01|...   \n",
       "2      ['None|泰迪智能科技股份有限公司|[\"数据服务\",\"互联网\"]|数据分析师|2022-...   \n",
       "3      ['None|新东方教育科技有限公司|[\"在线教育\"]|学管师|2021-07-01|202...   \n",
       "4                                                     []   \n",
       "...                                                  ...   \n",
       "10868                                                 []   \n",
       "10869                                                 []   \n",
       "10870                                                 []   \n",
       "10871                                                 []   \n",
       "10872                                                 []   \n",
       "\n",
       "                                          attachmentList   age  \n",
       "0      ['湖南科技大学-数据开发-李敏佳|None|/userywfmzg/16642547805...  22.0  \n",
       "1                                                     []  24.0  \n",
       "2                                                     []  22.0  \n",
       "3                                                     []  22.0  \n",
       "4                                                     []  23.0  \n",
       "...                                                  ...   ...  \n",
       "10868                                                 []  29.0  \n",
       "10869                                                 []  29.0  \n",
       "10870                                                 []  27.0  \n",
       "10871                                                 []  28.0  \n",
       "10872                                                 []  30.0  \n",
       "\n",
       "[8281 rows x 19 columns]"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import datetime\n",
    "current_time = datetime.datetime.now()\n",
    "PeopleDetail['age'] = (current_time - pd.to_datetime(PeopleDetail['birthday'])).astype('<m8[Y]')\n",
    "PeopleDetail"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.address处理\n",
    "\n",
    "有两条数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[\"广东省\",\"广州市\",\"天河区\"]    81\n",
       "[\"广东省\",\"广州市\",\"黄埔区\"]    41\n",
       "[\"北京市\",\"北京市\",\"东城区\"]     7\n",
       "[\"天津市\",\"天津市\",\"和平区\"]     5\n",
       "[\"广东省\",\"广州市\",\"白云区\"]     5\n",
       "                       ..\n",
       "[\"河北省\",\"邯郸市\",\"临漳县\"]     1\n",
       "[\"湖南省\",\"长沙市\",\"开福区\"]     1\n",
       "[\"福建省\",\"莆田市\",\"涵江区\"]     1\n",
       "[\"山东省\",\"菏泽市\",\"曹县\"]      1\n",
       "[\"广东省\",\"佛山市\",\"禅城区\"]     1\n",
       "Name: address, Length: 63, dtype: int64"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "PeopleDetail['address'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "def f(x):\n",
    "    if isinstance(x,str):\n",
    "        if \"\\\\\" in x and \"黄埔区\" in x:\n",
    "            return str([\"广东省\",\"广州市\",\"黄埔区\"])\n",
    "        elif \"\\\\\" in x and \"荔湾区\" in x:\n",
    "            return str([\"广东省\",\"广州市\",\"荔湾区\"])\n",
    "        else:\n",
    "            return x\n",
    "    return x\n",
    "            \n",
    "PeopleDetail['address'] = PeopleDetail['address'].map(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "split_df = PeopleDetail['address'].str.split(',', expand=True)\n",
    "del PeopleDetail['address']\n",
    "split_df = split_df.rename(columns={0: \"province\",1:\"city\",2:\"region\"})\n",
    "PeopleDetail = pd.concat([PeopleDetail, split_df], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "def f(x):\n",
    "    \"\"\"Strip list punctuation left over from splitting the address text.\n",
    "\n",
    "    Removes double quotes, single quotes, square brackets and backslashes,\n",
    "    then trims surrounding whitespace. Non-string values (for example NaN\n",
    "    for a missing address) are returned unchanged rather than being turned\n",
    "    into None by an implicit return.\n",
    "    \"\"\"\n",
    "    if not isinstance(x, str):\n",
    "        return x\n",
    "    for junk in ('\"', \"'\", \"[\", \"]\", \"\\\\\"):\n",
    "        x = x.replace(junk, \"\")\n",
    "    return x.strip()\n",
    "\n",
    "# Apply the cleaner defined in this cell; the previous calls used `f1`,\n",
    "# a name this cell never defines.\n",
    "for col in (\"province\", \"city\", \"region\"):\n",
    "    PeopleDetail[col] = PeopleDetail[col].map(f)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.政治状态处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>resumeId</th>\n",
       "      <th>birthday</th>\n",
       "      <th>arrivalTime</th>\n",
       "      <th>politicalStatus</th>\n",
       "      <th>selfEvaluation</th>\n",
       "      <th>expectIndustry</th>\n",
       "      <th>willNature</th>\n",
       "      <th>keywordList</th>\n",
       "      <th>educationExperienceList</th>\n",
       "      <th>projectExperienceList</th>\n",
       "      <th>...</th>\n",
       "      <th>trainingExperienceList</th>\n",
       "      <th>skillList</th>\n",
       "      <th>languageList</th>\n",
       "      <th>certList</th>\n",
       "      <th>workExperienceList</th>\n",
       "      <th>attachmentList</th>\n",
       "      <th>age</th>\n",
       "      <th>province</th>\n",
       "      <th>city</th>\n",
       "      <th>region</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1573938917198135296</td>\n",
       "      <td>2001-01-14</td>\n",
       "      <td>NaN</td>\n",
       "      <td>团员</td>\n",
       "      <td>掌握Java、Python的基础语法，以及常用的数据结构与算法\\n掌握Hadoop、Hive...</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>2.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>['湖南科技大学|数据科学与大数据技术|本科|2019-09-01|2023-06-17']</td>\n",
       "      <td>['工资管理系统|None|核心成员|2020-11-28|2020-12-10|使用Jav...</td>\n",
       "      <td>...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['英语|GOOD']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['湖南科技大学-数据开发-李敏佳|None|/userywfmzg/16642547805...</td>\n",
       "      <td>22.0</td>\n",
       "      <td>湖南省</td>\n",
       "      <td>湘潭市</td>\n",
       "      <td>雨湖区</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1541793867911790592</td>\n",
       "      <td>1999-03-07</td>\n",
       "      <td>2周后到岗</td>\n",
       "      <td>团员</td>\n",
       "      <td>具有较强的逻辑分析和再学习能力，对前端领域有所认知，积极进取，勇于挑战，为人热情，工作勤奋刻...</td>\n",
       "      <td>[\"不限\"]</td>\n",
       "      <td>2.0</td>\n",
       "      <td>['前端开发']</td>\n",
       "      <td>['湖南信息学院|网络工程|本科|2017-09-01|2022-06-28']</td>\n",
       "      <td>['校园综合平台小程序|湖南信息学院|前端开发|2021-03-03|2021-05-05|...</td>\n",
       "      <td>...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['英语|COMMONLY']</td>\n",
       "      <td>[]</td>\n",
       "      <td>['None|湖南游掌竟网络科技有限公司|[\"游戏\"]|前端开发实习|2021-07-01|...</td>\n",
       "      <td>[]</td>\n",
       "      <td>24.0</td>\n",
       "      <td>湖南省</td>\n",
       "      <td>长沙市</td>\n",
       "      <td>开福区</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1569514123790778368</td>\n",
       "      <td>2001-04-11</td>\n",
       "      <td>时间待议</td>\n",
       "      <td>团员</td>\n",
       "      <td>1.本人性格稳重随和、谈吐幽默风趣、具有优秀的社交能力和的组织协调能力。\\n2.适应能力强、...</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['工信部证书|达梦认证证书|2022-05-26|', '泰迪科技实习证明|泰迪科技实习证...</td>\n",
       "      <td>['None|泰迪智能科技股份有限公司|[\"数据服务\",\"互联网\"]|数据分析师|2022-...</td>\n",
       "      <td>[]</td>\n",
       "      <td>22.0</td>\n",
       "      <td>湖北省</td>\n",
       "      <td>武汉市</td>\n",
       "      <td>洪山区</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1550096565136392192</td>\n",
       "      <td>2001-01-04</td>\n",
       "      <td>时间待议</td>\n",
       "      <td>团员</td>\n",
       "      <td>本人有较强的组织协调能力、活动策划能力和公关能力;具有良好的团队精神，善于与人沟通和协作;社...</td>\n",
       "      <td>[\"不限\"]</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>['广州大学|数学与应用数学精算专业|本科|2019-09-01|2023-07-23']</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td>[]</td>\n",
       "      <td>['计算机二级|SKILLED']</td>\n",
       "      <td>['英语|GOOD']</td>\n",
       "      <td>[]</td>\n",
       "      <td>['None|新东方教育科技有限公司|[\"在线教育\"]|学管师|2021-07-01|202...</td>\n",
       "      <td>[]</td>\n",
       "      <td>22.0</td>\n",
       "      <td>广东省</td>\n",
       "      <td>梅州市</td>\n",
       "      <td>兴宁市</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1554782002648055808</td>\n",
       "      <td>1999-08-05</td>\n",
       "      <td>1周后到岗</td>\n",
       "      <td>中共党员</td>\n",
       "      <td>1、性格开朗外向，珍惜每一次学习机会，有较好的自学能力以及自我管理能力。\\n2、专业水平扎实...</td>\n",
       "      <td>[\"互联网\",\"金融\",\"在线教育\"]</td>\n",
       "      <td>2.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>['韩山师范学院|统计学|本科|2018-09-15|2022-06-27']</td>\n",
       "      <td>['航空公司客户价值分析|None|数据分析师|2021-11-18|2021-12-01|...</td>\n",
       "      <td>...</td>\n",
       "      <td>[]</td>\n",
       "      <td>['python|GOOD']</td>\n",
       "      <td>['英语四级|GOOD']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>23.0</td>\n",
       "      <td>广东省</td>\n",
       "      <td>广州市</td>\n",
       "      <td>白云区</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10868</th>\n",
       "      <td>7539911466257411604</td>\n",
       "      <td>1993-12-04</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['CBDA技能证书|大数据分析工程师-基础级|2018-04-13|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10869</th>\n",
       "      <td>7539911474847346196</td>\n",
       "      <td>1993-08-22</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10870</th>\n",
       "      <td>7539911500617149972</td>\n",
       "      <td>1995-08-08</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['CBDA技能证书|大数据分析工程师-基础级|2018-04-13|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>27.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10871</th>\n",
       "      <td>7539911483437280788</td>\n",
       "      <td>1994-11-24</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>28.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10872</th>\n",
       "      <td>7539911492027215380</td>\n",
       "      <td>1993-03-04</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>30.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8281 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                  resumeId    birthday arrivalTime politicalStatus  \\\n",
       "0      1573938917198135296  2001-01-14         NaN              团员   \n",
       "1      1541793867911790592  1999-03-07       2周后到岗              团员   \n",
       "2      1569514123790778368  2001-04-11        时间待议              团员   \n",
       "3      1550096565136392192  2001-01-04        时间待议              团员   \n",
       "4      1554782002648055808  1999-08-05       1周后到岗            中共党员   \n",
       "...                    ...         ...         ...             ...   \n",
       "10868  7539911466257411604  1993-12-04         NaN             NaN   \n",
       "10869  7539911474847346196  1993-08-22         NaN             NaN   \n",
       "10870  7539911500617149972  1995-08-08         NaN             NaN   \n",
       "10871  7539911483437280788  1994-11-24         NaN             NaN   \n",
       "10872  7539911492027215380  1993-03-04         NaN             NaN   \n",
       "\n",
       "                                          selfEvaluation       expectIndustry  \\\n",
       "0      掌握Java、Python的基础语法，以及常用的数据结构与算法\\n掌握Hadoop、Hive...              [\"互联网\"]   \n",
       "1      具有较强的逻辑分析和再学习能力，对前端领域有所认知，积极进取，勇于挑战，为人热情，工作勤奋刻...               [\"不限\"]   \n",
       "2      1.本人性格稳重随和、谈吐幽默风趣、具有优秀的社交能力和的组织协调能力。\\n2.适应能力强、...              [\"互联网\"]   \n",
       "3      本人有较强的组织协调能力、活动策划能力和公关能力;具有良好的团队精神，善于与人沟通和协作;社...               [\"不限\"]   \n",
       "4      1、性格开朗外向，珍惜每一次学习机会，有较好的自学能力以及自我管理能力。\\n2、专业水平扎实...  [\"互联网\",\"金融\",\"在线教育\"]   \n",
       "...                                                  ...                  ...   \n",
       "10868                                                NaN              [\"互联网\"]   \n",
       "10869                                                NaN              [\"互联网\"]   \n",
       "10870                                                NaN              [\"互联网\"]   \n",
       "10871                                                NaN              [\"互联网\"]   \n",
       "10872                                                NaN              [\"互联网\"]   \n",
       "\n",
       "       willNature keywordList                         educationExperienceList  \\\n",
       "0             2.0          []  ['湖南科技大学|数据科学与大数据技术|本科|2019-09-01|2023-06-17']   \n",
       "1             2.0    ['前端开发']        ['湖南信息学院|网络工程|本科|2017-09-01|2022-06-28']   \n",
       "2             0.0          []                                              []   \n",
       "3             0.0          []   ['广州大学|数学与应用数学精算专业|本科|2019-09-01|2023-07-23']   \n",
       "4             2.0          []         ['韩山师范学院|统计学|本科|2018-09-15|2022-06-27']   \n",
       "...           ...         ...                                             ...   \n",
       "10868         NaN          []                                              []   \n",
       "10869         NaN          []                                              []   \n",
       "10870         NaN          []                                              []   \n",
       "10871         NaN          []                                              []   \n",
       "10872         NaN          []                                              []   \n",
       "\n",
       "                                   projectExperienceList  ...  \\\n",
       "0      ['工资管理系统|None|核心成员|2020-11-28|2020-12-10|使用Jav...  ...   \n",
       "1      ['校园综合平台小程序|湖南信息学院|前端开发|2021-03-03|2021-05-05|...  ...   \n",
       "2                                                     []  ...   \n",
       "3                                                     []  ...   \n",
       "4      ['航空公司客户价值分析|None|数据分析师|2021-11-18|2021-12-01|...  ...   \n",
       "...                                                  ...  ...   \n",
       "10868                                                 []  ...   \n",
       "10869                                                 []  ...   \n",
       "10870                                                 []  ...   \n",
       "10871                                                 []  ...   \n",
       "10872                                                 []  ...   \n",
       "\n",
       "      trainingExperienceList          skillList     languageList  \\\n",
       "0                         []                 []      ['英语|GOOD']   \n",
       "1                         []                 []  ['英语|COMMONLY']   \n",
       "2                         []                 []               []   \n",
       "3                         []  ['计算机二级|SKILLED']      ['英语|GOOD']   \n",
       "4                         []    ['python|GOOD']    ['英语四级|GOOD']   \n",
       "...                      ...                ...              ...   \n",
       "10868                     []                 []               []   \n",
       "10869                     []                 []               []   \n",
       "10870                     []                 []               []   \n",
       "10871                     []                 []               []   \n",
       "10872                     []                 []               []   \n",
       "\n",
       "                                                certList  \\\n",
       "0                                                     []   \n",
       "1                                                     []   \n",
       "2      ['工信部证书|达梦认证证书|2022-05-26|', '泰迪科技实习证明|泰迪科技实习证...   \n",
       "3                                                     []   \n",
       "4                                                     []   \n",
       "...                                                  ...   \n",
       "10868          ['CBDA技能证书|大数据分析工程师-基础级|2018-04-13|None']   \n",
       "10869          ['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']   \n",
       "10870          ['CBDA技能证书|大数据分析工程师-基础级|2018-04-13|None']   \n",
       "10871          ['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']   \n",
       "10872          ['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']   \n",
       "\n",
       "                                      workExperienceList  \\\n",
       "0                                                     []   \n",
       "1      ['None|湖南游掌竟网络科技有限公司|[\"游戏\"]|前端开发实习|2021-07-01|...   \n",
       "2      ['None|泰迪智能科技股份有限公司|[\"数据服务\",\"互联网\"]|数据分析师|2022-...   \n",
       "3      ['None|新东方教育科技有限公司|[\"在线教育\"]|学管师|2021-07-01|202...   \n",
       "4                                                     []   \n",
       "...                                                  ...   \n",
       "10868                                                 []   \n",
       "10869                                                 []   \n",
       "10870                                                 []   \n",
       "10871                                                 []   \n",
       "10872                                                 []   \n",
       "\n",
       "                                          attachmentList   age  province city  \\\n",
       "0      ['湖南科技大学-数据开发-李敏佳|None|/userywfmzg/16642547805...  22.0       湖南省  湘潭市   \n",
       "1                                                     []  24.0       湖南省  长沙市   \n",
       "2                                                     []  22.0       湖北省  武汉市   \n",
       "3                                                     []  22.0       广东省  梅州市   \n",
       "4                                                     []  23.0       广东省  广州市   \n",
       "...                                                  ...   ...       ...  ...   \n",
       "10868                                                 []  29.0       NaN  NaN   \n",
       "10869                                                 []  29.0       NaN  NaN   \n",
       "10870                                                 []  27.0       NaN  NaN   \n",
       "10871                                                 []  28.0       NaN  NaN   \n",
       "10872                                                 []  30.0       NaN  NaN   \n",
       "\n",
       "      region  \n",
       "0        雨湖区  \n",
       "1        开福区  \n",
       "2        洪山区  \n",
       "3        兴宁市  \n",
       "4        白云区  \n",
       "...      ...  \n",
       "10868    NaN  \n",
       "10869    NaN  \n",
       "10870    NaN  \n",
       "10871    NaN  \n",
       "10872    NaN  \n",
       "\n",
       "[8281 rows x 21 columns]"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Lookup table from the politicalStatus codes to their Chinese labels.\n",
    "politicalStatusDic = dict(\n",
    "    LEAGUE_MEMBER='团员',\n",
    "    MASSES='群众',\n",
    "    PARTY_MEMBER='中共党员',\n",
    "    PROBATIONARY_PARTY_MEMBER='中共预备党员',\n",
    ")\n",
    "# Series.map with a dict turns every value that is not a key into NaN.\n",
    "PeopleDetail['politicalStatus'] = (\n",
    "    PeopleDetail['politicalStatus'].map(politicalStatusDic)\n",
    ")\n",
    "PeopleDetail"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5.期望行业"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "有两条脏数据，处理掉"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# PeopleDetail[\"expectIndustry\"].value_counts().to_csv(\"temp.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[\"互联网\"]                   8101\n",
       "[\"不限\"]                      77\n",
       "[]                          27\n",
       "[\"互联网\",\"不限\"]                 5\n",
       "[\"互联网\",\"数据服务\"]               5\n",
       "[\"互联网\",\"媒体\"]                 3\n",
       "[\"不限\",\"互联网\",\"数据服务\"]          2\n",
       "[\"不限\",\"互联网\",\"金融\"]            2\n",
       "[\"媒体\"]                       2\n",
       "[\"电子商务\"]                     2\n",
       "[\"金融\"]                       2\n",
       "[\"互联网\",\"游戏\",\"计算机软件\"]         2\n",
       "[\"互联网\",\"数据服务\",\"O2O\"]         2\n",
       "[\"互联网\",\"金融\",\"在线教育\"]          2\n",
       "[\"互联网\",\"金融\",\"电子商务\"]          2\n",
       "[\"金融\",\"互联网\"]                 2\n",
       "[\"互联网\",\"不限\",\"电子商务\"]          1\n",
       "[\"互联网\",\"计算机软件\"]              1\n",
       "[\"互联网\",\"媒体\",\"数据服务\"]          1\n",
       "[\"互联网\",\"医疗健康\",\"通信设备\"]        1\n",
       "[\"互联网\",\"游戏\",\"数据服务\"]          1\n",
       "[\"电子商务\",\"互联网\",\"金融\"]          1\n",
       "[\"互联网\",\"计算机软件\",\"金融\"]         1\n",
       "[\"互联网\",\"数据服务\",\"不限\"]          1\n",
       "[\"金融\",\"互联网\",\"数据服务\"]          1\n",
       "[\"互联网\",\"数据服务\",\"金融\"]          1\n",
       "[\"互联网\",\"游戏\"]                 1\n",
       "[\"互联网\",\"信息安全\",\"数据服务\"]        1\n",
       "['不限']                       1\n",
       "[\"不限\",\"游戏\",\"数据服务\"]           1\n",
       "[\"互联网\",\"数据服务\",\"信息安全\"]        1\n",
       "[\"互联网\",\"游戏\",\"不限\"]            1\n",
       "[\"不限\",\"互联网\"]                 1\n",
       "[\"互联网\",\"游戏\",\"金融\"]            1\n",
       "[\"金融\",\"不限\"]                  1\n",
       "[\"互联网\",\"在线教育\",\"游戏\"]          1\n",
       "[\"互联网\",\"电子商务\",\"游戏\"]          1\n",
       "[\"游戏\",\"互联网\"]                 1\n",
       "[\"计算机软件\",\"互联网\",\"信息安全\"]       1\n",
       "[\"数据服务\"]                     1\n",
       "[\"互联网\",\"信息安全\"]               1\n",
       "[\"金融\",\"电子商务\",\"媒体\"]           1\n",
       "[\"网络设备\"]                     1\n",
       "[\"互联网\",\"电子商务\",\"数据服务\"]        1\n",
       "[\"游戏\"]                       1\n",
       "[\"医疗健康\",\"互联网\"]               1\n",
       "[\"金融\",\"电子商务\"]                1\n",
       "[\"数据服务\",\"通信设备\"]              1\n",
       "[\"互联网\",\"计算机软件\",\"数据服务\"]       1\n",
       "[\"不限\",\"金融\"]                  1\n",
       "[\"互联网\",\"电子商务\",\"金融\"]          1\n",
       "[\"互联网\",\"数据服务\",\"电子商务\"]        1\n",
       "[\"电子商务\",\"游戏\",\"不限\"]           1\n",
       "[\"金融\",\"互联网\",\"电子商务\"]          1\n",
       "['互联网', '信息安全']              1\n",
       "[\"互联网\",\"金融\",\"计算机软件\"]         1\n",
       "[\"互联网\",\"电子商务\"]               1\n",
       "[\"计算机软件\",\"数据服务\",\"不限\"]        1\n",
       "[\"媒体\",\"人力资源服务\",\"游戏\"]         1\n",
       "Name: expectIndustry, dtype: int64"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def f(x):\n",
    "    \"\"\"Repair expectIndustry values that carry stray backslash escapes.\n",
    "\n",
    "    Only strings containing a backslash are touched: one mentioning 不限 is\n",
    "    replaced by the single-item list, otherwise one mentioning 信息安全 is\n",
    "    replaced by the two-item list. Every other value, including non-strings\n",
    "    such as NaN, is returned unchanged. The replacements use the\n",
    "    double-quoted, no-space layout of the clean rows, so value_counts()\n",
    "    groups repaired rows with them; str() on a Python list would produce\n",
    "    single-quoted text that is counted as a separate category.\n",
    "    \"\"\"\n",
    "    if not isinstance(x, str) or \"\\\\\" not in x:\n",
    "        return x\n",
    "    if \"不限\" in x:\n",
    "        return '[\"不限\"]'\n",
    "    if \"信息安全\" in x:\n",
    "        return '[\"互联网\",\"信息安全\"]'\n",
    "    return x\n",
    "\n",
    "PeopleDetail['expectIndustry'] = PeopleDetail['expectIndustry'].map(f)\n",
    "PeopleDetail[\"expectIndustry\"].value_counts()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 6.期望工作性质,willNature"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "取值 1.0：查阅官方字段说明后发现它代表空缺值，因此统一填充为“全职”"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2.0    184\n",
      "0.0     19\n",
      "1.0     13\n",
      "Name: willNature, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Show the distribution of the raw codes before they are recoded.\n",
    "print(PeopleDetail['willNature'].value_counts())\n",
    "# Codes 2.0 and 1.0 both map to 全职, 0.0 maps to 实习 (key order kept as before).\n",
    "willNatureDic = dict(zip((2.0, 0.0, 1.0), ('全职', \"实习\", \"全职\")))\n",
    "PeopleDetail['willNature'] = PeopleDetail['willNature'].map(willNatureDic)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 7.educationExperienceList处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "# PeopleDetail['educationExperienceList'].value_counts().to_csv(\"./temp.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0        ['湖南科技大学|数据科学与大数据技术|本科|2019-09-01|2023-06-17']\n",
       "1              ['湖南信息学院|网络工程|本科|2017-09-01|2022-06-28']\n",
       "2                                                    []\n",
       "3         ['广州大学|数学与应用数学精算专业|本科|2019-09-01|2023-07-23']\n",
       "4               ['韩山师范学院|统计学|本科|2018-09-15|2022-06-27']\n",
       "                              ...                      \n",
       "10868                                                []\n",
       "10869                                                []\n",
       "10870                                                []\n",
       "10871                                                []\n",
       "10872                                                []\n",
       "Name: educationExperienceList, Length: 8281, dtype: object"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "PeopleDetail['educationExperienceList']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "def f(x):\n",
    "    \"\"\"Swap every comma in the text for the '|' field delimiter.\"\"\"\n",
    "    return x.replace(\",\", \"|\") if \",\" in x else x\n",
    "\n",
    "# After the swap, one '|' split yields the fields of every entry in order.\n",
    "PeopleDetail['educationExperienceList'] = PeopleDetail['educationExperienceList'].map(f)\n",
    "education_columns = ['学校1', '专业1', '学位1', '起始时间1', '毕业时间1',\n",
    "                     '学校2', '专业2', '学位2', '起始时间2', '毕业时间2']\n",
    "split_df = (\n",
    "    PeopleDetail['educationExperienceList']\n",
    "    .str.split('|', expand=True)\n",
    "    .rename(columns=dict(enumerate(education_columns)))\n",
    ")\n",
    "PeopleDetail = pd.concat([PeopleDetail, split_df], axis=1)\n",
    "# The packed column is no longer needed once its fields are separate columns.\n",
    "del PeopleDetail['educationExperienceList']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove stray list punctuation from the split education fields\n",
    "def f(x):\n",
    "    \"\"\"Strip quotes and square brackets from one split education field.\n",
    "\n",
    "    Non-string values (None for a missing field, or NaN) are returned\n",
    "    unchanged; a plain truthiness check would let NaN through and fail on\n",
    "    .replace. Surrounding whitespace is trimmed as well, since the source\n",
    "    text looks like a Python list repr whose ', ' separator would otherwise\n",
    "    leave a leading space on the second entry's first field.\n",
    "    \"\"\"\n",
    "    if not isinstance(x, str):\n",
    "        return x\n",
    "    for junk in ('\"', \"[\", \"]\", \"'\"):\n",
    "        x = x.replace(junk, \"\")\n",
    "    return x.strip()\n",
    "\n",
    "\n",
    "# Only the first and last field of each entry sit next to the list punctuation.\n",
    "for col in ('学校1', '学校2', '毕业时间1', '毕业时间2'):\n",
    "    PeopleDetail[col] = PeopleDetail[col].map(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>resumeId</th>\n",
       "      <th>resumeName</th>\n",
       "      <th>birthday</th>\n",
       "      <th>address</th>\n",
       "      <th>arrivalTime</th>\n",
       "      <th>politicalStatus</th>\n",
       "      <th>selfEvaluation</th>\n",
       "      <th>expectIndustry</th>\n",
       "      <th>willNature</th>\n",
       "      <th>keywordList</th>\n",
       "      <th>...</th>\n",
       "      <th>学校1</th>\n",
       "      <th>专业1</th>\n",
       "      <th>学位1</th>\n",
       "      <th>起始时间1</th>\n",
       "      <th>毕业时间1</th>\n",
       "      <th>学校2</th>\n",
       "      <th>专业2</th>\n",
       "      <th>学位2</th>\n",
       "      <th>起始时间2</th>\n",
       "      <th>毕业时间2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1573938917198135296</td>\n",
       "      <td>湖南科技大学-大数据-李敏佳</td>\n",
       "      <td>2001-01-14 00:00:00.0</td>\n",
       "      <td>[\"湖南省\",\"湘潭市\",\"雨湖区\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>LEAGUE_MEMBER</td>\n",
       "      <td>掌握Java、Python的基础语法，以及常用的数据结构与算法\\n掌握Hadoop、Hive...</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>2.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td>湖南科技大学</td>\n",
       "      <td>数据科学与大数据技术</td>\n",
       "      <td>本科</td>\n",
       "      <td>2019-09-01</td>\n",
       "      <td>2023-06-17</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1541793867911790592</td>\n",
       "      <td>前端开发-殷浩</td>\n",
       "      <td>1999-03-07 00:00:00.0</td>\n",
       "      <td>[\"湖南省\",\"长沙市\",\"开福区\"]</td>\n",
       "      <td>2周后到岗</td>\n",
       "      <td>LEAGUE_MEMBER</td>\n",
       "      <td>具有较强的逻辑分析和再学习能力，对前端领域有所认知，积极进取，勇于挑战，为人热情，工作勤奋刻...</td>\n",
       "      <td>[\"不限\"]</td>\n",
       "      <td>2.0</td>\n",
       "      <td>['前端开发']</td>\n",
       "      <td>...</td>\n",
       "      <td>湖南信息学院</td>\n",
       "      <td>网络工程</td>\n",
       "      <td>本科</td>\n",
       "      <td>2017-09-01</td>\n",
       "      <td>2022-06-28</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1569514123790778368</td>\n",
       "      <td>简历</td>\n",
       "      <td>2001-04-11 00:00:00.0</td>\n",
       "      <td>[\"湖北省\",\"武汉市\",\"洪山区\"]</td>\n",
       "      <td>时间待议</td>\n",
       "      <td>LEAGUE_MEMBER</td>\n",
       "      <td>1.本人性格稳重随和、谈吐幽默风趣、具有优秀的社交能力和的组织协调能力。\\n2.适应能力强、...</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1550096565136392192</td>\n",
       "      <td>蓝梓坚</td>\n",
       "      <td>2001-01-04 00:00:00.0</td>\n",
       "      <td>[\"广东省\",\"梅州市\",\"兴宁市\"]</td>\n",
       "      <td>时间待议</td>\n",
       "      <td>LEAGUE_MEMBER</td>\n",
       "      <td>本人有较强的组织协调能力、活动策划能力和公关能力;具有良好的团队精神，善于与人沟通和协作;社...</td>\n",
       "      <td>[\"不限\"]</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td>广州大学</td>\n",
       "      <td>数学与应用数学精算专业</td>\n",
       "      <td>本科</td>\n",
       "      <td>2019-09-01</td>\n",
       "      <td>2023-07-23</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1554782002648055808</td>\n",
       "      <td>个人简历</td>\n",
       "      <td>1999-08-05 00:00:00.0</td>\n",
       "      <td>[\"广东省\",\"广州市\",\"白云区\"]</td>\n",
       "      <td>1周后到岗</td>\n",
       "      <td>PARTY_MEMBER</td>\n",
       "      <td>1、性格开朗外向，珍惜每一次学习机会，有较好的自学能力以及自我管理能力。\\n2、专业水平扎实...</td>\n",
       "      <td>[\"互联网\",\"金融\",\"在线教育\"]</td>\n",
       "      <td>2.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td>韩山师范学院</td>\n",
       "      <td>统计学</td>\n",
       "      <td>本科</td>\n",
       "      <td>2018-09-15</td>\n",
       "      <td>2022-06-27</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10868</th>\n",
       "      <td>7539911466257411604</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1993-12-04 00:00:00.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10869</th>\n",
       "      <td>7539911474847346196</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1993-08-22 00:00:00.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10870</th>\n",
       "      <td>7539911500617149972</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1995-08-08 00:00:00.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10871</th>\n",
       "      <td>7539911483437280788</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1994-11-24 00:00:00.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10872</th>\n",
       "      <td>7539911492027215380</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1993-03-04 00:00:00.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8281 rows × 28 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                  resumeId      resumeName               birthday  \\\n",
       "0      1573938917198135296  湖南科技大学-大数据-李敏佳  2001-01-14 00:00:00.0   \n",
       "1      1541793867911790592         前端开发-殷浩  1999-03-07 00:00:00.0   \n",
       "2      1569514123790778368              简历  2001-04-11 00:00:00.0   \n",
       "3      1550096565136392192             蓝梓坚  2001-01-04 00:00:00.0   \n",
       "4      1554782002648055808            个人简历  1999-08-05 00:00:00.0   \n",
       "...                    ...             ...                    ...   \n",
       "10868  7539911466257411604             NaN  1993-12-04 00:00:00.0   \n",
       "10869  7539911474847346196             NaN  1993-08-22 00:00:00.0   \n",
       "10870  7539911500617149972             NaN  1995-08-08 00:00:00.0   \n",
       "10871  7539911483437280788             NaN  1994-11-24 00:00:00.0   \n",
       "10872  7539911492027215380             NaN  1993-03-04 00:00:00.0   \n",
       "\n",
       "                   address arrivalTime politicalStatus  \\\n",
       "0      [\"湖南省\",\"湘潭市\",\"雨湖区\"]         NaN   LEAGUE_MEMBER   \n",
       "1      [\"湖南省\",\"长沙市\",\"开福区\"]       2周后到岗   LEAGUE_MEMBER   \n",
       "2      [\"湖北省\",\"武汉市\",\"洪山区\"]        时间待议   LEAGUE_MEMBER   \n",
       "3      [\"广东省\",\"梅州市\",\"兴宁市\"]        时间待议   LEAGUE_MEMBER   \n",
       "4      [\"广东省\",\"广州市\",\"白云区\"]       1周后到岗    PARTY_MEMBER   \n",
       "...                    ...         ...             ...   \n",
       "10868                  NaN         NaN             NaN   \n",
       "10869                  NaN         NaN             NaN   \n",
       "10870                  NaN         NaN             NaN   \n",
       "10871                  NaN         NaN             NaN   \n",
       "10872                  NaN         NaN             NaN   \n",
       "\n",
       "                                          selfEvaluation       expectIndustry  \\\n",
       "0      掌握Java、Python的基础语法，以及常用的数据结构与算法\\n掌握Hadoop、Hive...              [\"互联网\"]   \n",
       "1      具有较强的逻辑分析和再学习能力，对前端领域有所认知，积极进取，勇于挑战，为人热情，工作勤奋刻...               [\"不限\"]   \n",
       "2      1.本人性格稳重随和、谈吐幽默风趣、具有优秀的社交能力和的组织协调能力。\\n2.适应能力强、...              [\"互联网\"]   \n",
       "3      本人有较强的组织协调能力、活动策划能力和公关能力;具有良好的团队精神，善于与人沟通和协作;社...               [\"不限\"]   \n",
       "4      1、性格开朗外向，珍惜每一次学习机会，有较好的自学能力以及自我管理能力。\\n2、专业水平扎实...  [\"互联网\",\"金融\",\"在线教育\"]   \n",
       "...                                                  ...                  ...   \n",
       "10868                                                NaN              [\"互联网\"]   \n",
       "10869                                                NaN              [\"互联网\"]   \n",
       "10870                                                NaN              [\"互联网\"]   \n",
       "10871                                                NaN              [\"互联网\"]   \n",
       "10872                                                NaN              [\"互联网\"]   \n",
       "\n",
       "       willNature keywordList  ...     学校1          专业1   学位1       起始时间1  \\\n",
       "0             2.0          []  ...  湖南科技大学   数据科学与大数据技术    本科  2019-09-01   \n",
       "1             2.0    ['前端开发']  ...  湖南信息学院         网络工程    本科  2017-09-01   \n",
       "2             0.0          []  ...                 None  None        None   \n",
       "3             0.0          []  ...    广州大学  数学与应用数学精算专业    本科  2019-09-01   \n",
       "4             2.0          []  ...  韩山师范学院          统计学    本科  2018-09-15   \n",
       "...           ...         ...  ...     ...          ...   ...         ...   \n",
       "10868         NaN          []  ...                 None  None        None   \n",
       "10869         NaN          []  ...                 None  None        None   \n",
       "10870         NaN          []  ...                 None  None        None   \n",
       "10871         NaN          []  ...                 None  None        None   \n",
       "10872         NaN          []  ...                 None  None        None   \n",
       "\n",
       "            毕业时间1   学校2   专业2   学位2 起始时间2 毕业时间2  \n",
       "0      2023-06-17  None  None  None  None  None  \n",
       "1      2022-06-28  None  None  None  None  None  \n",
       "2            None  None  None  None  None  None  \n",
       "3      2023-07-23  None  None  None  None  None  \n",
       "4      2022-06-27  None  None  None  None  None  \n",
       "...           ...   ...   ...   ...   ...   ...  \n",
       "10868        None  None  None  None  None  None  \n",
       "10869        None  None  None  None  None  None  \n",
       "10870        None  None  None  None  None  None  \n",
       "10871        None  None  None  None  None  None  \n",
       "10872        None  None  None  None  None  None  \n",
       "\n",
       "[8281 rows x 28 columns]"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "PeopleDetail"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>resumeId</th>\n",
       "      <th>resumeName</th>\n",
       "      <th>birthday</th>\n",
       "      <th>address</th>\n",
       "      <th>arrivalTime</th>\n",
       "      <th>politicalStatus</th>\n",
       "      <th>selfEvaluation</th>\n",
       "      <th>expectIndustry</th>\n",
       "      <th>willNature</th>\n",
       "      <th>keywordList</th>\n",
       "      <th>...</th>\n",
       "      <th>competitionExperienceList</th>\n",
       "      <th>trainingExperienceList</th>\n",
       "      <th>skillList</th>\n",
       "      <th>languageList</th>\n",
       "      <th>certList</th>\n",
       "      <th>workExperienceList</th>\n",
       "      <th>attachmentList</th>\n",
       "      <th>school</th>\n",
       "      <th>major</th>\n",
       "      <th>degree</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1573938917198135296</td>\n",
       "      <td>湖南科技大学-大数据-李敏佳</td>\n",
       "      <td>2001-01-14 00:00:00.0</td>\n",
       "      <td>[\"湖南省\",\"湘潭市\",\"雨湖区\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>LEAGUE_MEMBER</td>\n",
       "      <td>掌握Java、Python的基础语法，以及常用的数据结构与算法\\n掌握Hadoop、Hive...</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>2.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['英语|GOOD']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['湖南科技大学-数据开发-李敏佳|None|/userywfmzg/16642547805...</td>\n",
       "      <td>湖南科技大学</td>\n",
       "      <td>数据科学与大数据技术</td>\n",
       "      <td>本科</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1541793867911790592</td>\n",
       "      <td>前端开发-殷浩</td>\n",
       "      <td>1999-03-07 00:00:00.0</td>\n",
       "      <td>[\"湖南省\",\"长沙市\",\"开福区\"]</td>\n",
       "      <td>2周后到岗</td>\n",
       "      <td>LEAGUE_MEMBER</td>\n",
       "      <td>具有较强的逻辑分析和再学习能力，对前端领域有所认知，积极进取，勇于挑战，为人热情，工作勤奋刻...</td>\n",
       "      <td>[\"不限\"]</td>\n",
       "      <td>2.0</td>\n",
       "      <td>['前端开发']</td>\n",
       "      <td>...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['英语|COMMONLY']</td>\n",
       "      <td>[]</td>\n",
       "      <td>['None|湖南游掌竟网络科技有限公司|[\"游戏\"]|前端开发实习|2021-07-01|...</td>\n",
       "      <td>[]</td>\n",
       "      <td>湖南信息学院</td>\n",
       "      <td>网络工程</td>\n",
       "      <td>本科</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1569514123790778368</td>\n",
       "      <td>简历</td>\n",
       "      <td>2001-04-11 00:00:00.0</td>\n",
       "      <td>[\"湖北省\",\"武汉市\",\"洪山区\"]</td>\n",
       "      <td>时间待议</td>\n",
       "      <td>LEAGUE_MEMBER</td>\n",
       "      <td>1.本人性格稳重随和、谈吐幽默风趣、具有优秀的社交能力和的组织协调能力。\\n2.适应能力强、...</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td>['其他|“育知联杯”智慧商业大数据创新应用大赛|一等奖|None', '其他|湖北省信创大...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['工信部证书|达梦认证证书|2022-05-26|', '泰迪科技实习证明|泰迪科技实习证...</td>\n",
       "      <td>['None|泰迪智能科技股份有限公司|[\"数据服务\",\"互联网\"]|数据分析师|2022-...</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1550096565136392192</td>\n",
       "      <td>蓝梓坚</td>\n",
       "      <td>2001-01-04 00:00:00.0</td>\n",
       "      <td>[\"广东省\",\"梅州市\",\"兴宁市\"]</td>\n",
       "      <td>时间待议</td>\n",
       "      <td>LEAGUE_MEMBER</td>\n",
       "      <td>本人有较强的组织协调能力、活动策划能力和公关能力;具有良好的团队精神，善于与人沟通和协作;社...</td>\n",
       "      <td>[\"不限\"]</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td>['其他|“互联网+”大学生创业大赛|校级铜奖|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>['计算机二级|SKILLED']</td>\n",
       "      <td>['英语|GOOD']</td>\n",
       "      <td>[]</td>\n",
       "      <td>['None|新东方教育科技有限公司|[\"在线教育\"]|学管师|2021-07-01|202...</td>\n",
       "      <td>[]</td>\n",
       "      <td>广州大学</td>\n",
       "      <td>数学与应用数学精算专业</td>\n",
       "      <td>本科</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1554782002648055808</td>\n",
       "      <td>个人简历</td>\n",
       "      <td>1999-08-05 00:00:00.0</td>\n",
       "      <td>[\"广东省\",\"广州市\",\"白云区\"]</td>\n",
       "      <td>1周后到岗</td>\n",
       "      <td>PARTY_MEMBER</td>\n",
       "      <td>1、性格开朗外向，珍惜每一次学习机会，有较好的自学能力以及自我管理能力。\\n2、专业水平扎实...</td>\n",
       "      <td>[\"互联网\",\"金融\",\"在线教育\"]</td>\n",
       "      <td>2.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td>['泰迪杯|第八届泰迪杯数据挖掘挑战赛|三等奖|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>['python|GOOD']</td>\n",
       "      <td>['英语四级|GOOD']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>韩山师范学院</td>\n",
       "      <td>统计学</td>\n",
       "      <td>本科</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10868</th>\n",
       "      <td>7539911466257411604</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1993-12-04 00:00:00.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['CBDA技能证书|大数据分析工程师-基础级|2018-04-13|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10869</th>\n",
       "      <td>7539911474847346196</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1993-08-22 00:00:00.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10870</th>\n",
       "      <td>7539911500617149972</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1995-08-08 00:00:00.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['CBDA技能证书|大数据分析工程师-基础级|2018-04-13|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10871</th>\n",
       "      <td>7539911483437280788</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1994-11-24 00:00:00.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10872</th>\n",
       "      <td>7539911492027215380</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1993-03-04 00:00:00.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8281 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                  resumeId      resumeName               birthday  \\\n",
       "0      1573938917198135296  湖南科技大学-大数据-李敏佳  2001-01-14 00:00:00.0   \n",
       "1      1541793867911790592         前端开发-殷浩  1999-03-07 00:00:00.0   \n",
       "2      1569514123790778368              简历  2001-04-11 00:00:00.0   \n",
       "3      1550096565136392192             蓝梓坚  2001-01-04 00:00:00.0   \n",
       "4      1554782002648055808            个人简历  1999-08-05 00:00:00.0   \n",
       "...                    ...             ...                    ...   \n",
       "10868  7539911466257411604             NaN  1993-12-04 00:00:00.0   \n",
       "10869  7539911474847346196             NaN  1993-08-22 00:00:00.0   \n",
       "10870  7539911500617149972             NaN  1995-08-08 00:00:00.0   \n",
       "10871  7539911483437280788             NaN  1994-11-24 00:00:00.0   \n",
       "10872  7539911492027215380             NaN  1993-03-04 00:00:00.0   \n",
       "\n",
       "                   address arrivalTime politicalStatus  \\\n",
       "0      [\"湖南省\",\"湘潭市\",\"雨湖区\"]         NaN   LEAGUE_MEMBER   \n",
       "1      [\"湖南省\",\"长沙市\",\"开福区\"]       2周后到岗   LEAGUE_MEMBER   \n",
       "2      [\"湖北省\",\"武汉市\",\"洪山区\"]        时间待议   LEAGUE_MEMBER   \n",
       "3      [\"广东省\",\"梅州市\",\"兴宁市\"]        时间待议   LEAGUE_MEMBER   \n",
       "4      [\"广东省\",\"广州市\",\"白云区\"]       1周后到岗    PARTY_MEMBER   \n",
       "...                    ...         ...             ...   \n",
       "10868                  NaN         NaN             NaN   \n",
       "10869                  NaN         NaN             NaN   \n",
       "10870                  NaN         NaN             NaN   \n",
       "10871                  NaN         NaN             NaN   \n",
       "10872                  NaN         NaN             NaN   \n",
       "\n",
       "                                          selfEvaluation       expectIndustry  \\\n",
       "0      掌握Java、Python的基础语法，以及常用的数据结构与算法\\n掌握Hadoop、Hive...              [\"互联网\"]   \n",
       "1      具有较强的逻辑分析和再学习能力，对前端领域有所认知，积极进取，勇于挑战，为人热情，工作勤奋刻...               [\"不限\"]   \n",
       "2      1.本人性格稳重随和、谈吐幽默风趣、具有优秀的社交能力和的组织协调能力。\\n2.适应能力强、...              [\"互联网\"]   \n",
       "3      本人有较强的组织协调能力、活动策划能力和公关能力;具有良好的团队精神，善于与人沟通和协作;社...               [\"不限\"]   \n",
       "4      1、性格开朗外向，珍惜每一次学习机会，有较好的自学能力以及自我管理能力。\\n2、专业水平扎实...  [\"互联网\",\"金融\",\"在线教育\"]   \n",
       "...                                                  ...                  ...   \n",
       "10868                                                NaN              [\"互联网\"]   \n",
       "10869                                                NaN              [\"互联网\"]   \n",
       "10870                                                NaN              [\"互联网\"]   \n",
       "10871                                                NaN              [\"互联网\"]   \n",
       "10872                                                NaN              [\"互联网\"]   \n",
       "\n",
       "       willNature keywordList  ...  \\\n",
       "0             2.0          []  ...   \n",
       "1             2.0    ['前端开发']  ...   \n",
       "2             0.0          []  ...   \n",
       "3             0.0          []  ...   \n",
       "4             2.0          []  ...   \n",
       "...           ...         ...  ...   \n",
       "10868         NaN          []  ...   \n",
       "10869         NaN          []  ...   \n",
       "10870         NaN          []  ...   \n",
       "10871         NaN          []  ...   \n",
       "10872         NaN          []  ...   \n",
       "\n",
       "                               competitionExperienceList  \\\n",
       "0                                                     []   \n",
       "1                                                     []   \n",
       "2      ['其他|“育知联杯”智慧商业大数据创新应用大赛|一等奖|None', '其他|湖北省信创大...   \n",
       "3                         ['其他|“互联网+”大学生创业大赛|校级铜奖|None']   \n",
       "4                         ['泰迪杯|第八届泰迪杯数据挖掘挑战赛|三等奖|None']   \n",
       "...                                                  ...   \n",
       "10868                                                 []   \n",
       "10869                                                 []   \n",
       "10870                                                 []   \n",
       "10871                                                 []   \n",
       "10872                                                 []   \n",
       "\n",
       "      trainingExperienceList          skillList     languageList  \\\n",
       "0                         []                 []      ['英语|GOOD']   \n",
       "1                         []                 []  ['英语|COMMONLY']   \n",
       "2                         []                 []               []   \n",
       "3                         []  ['计算机二级|SKILLED']      ['英语|GOOD']   \n",
       "4                         []    ['python|GOOD']    ['英语四级|GOOD']   \n",
       "...                      ...                ...              ...   \n",
       "10868                     []                 []               []   \n",
       "10869                     []                 []               []   \n",
       "10870                     []                 []               []   \n",
       "10871                     []                 []               []   \n",
       "10872                     []                 []               []   \n",
       "\n",
       "                                                certList  \\\n",
       "0                                                     []   \n",
       "1                                                     []   \n",
       "2      ['工信部证书|达梦认证证书|2022-05-26|', '泰迪科技实习证明|泰迪科技实习证...   \n",
       "3                                                     []   \n",
       "4                                                     []   \n",
       "...                                                  ...   \n",
       "10868          ['CBDA技能证书|大数据分析工程师-基础级|2018-04-13|None']   \n",
       "10869          ['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']   \n",
       "10870          ['CBDA技能证书|大数据分析工程师-基础级|2018-04-13|None']   \n",
       "10871          ['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']   \n",
       "10872          ['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']   \n",
       "\n",
       "                                      workExperienceList  \\\n",
       "0                                                     []   \n",
       "1      ['None|湖南游掌竟网络科技有限公司|[\"游戏\"]|前端开发实习|2021-07-01|...   \n",
       "2      ['None|泰迪智能科技股份有限公司|[\"数据服务\",\"互联网\"]|数据分析师|2022-...   \n",
       "3      ['None|新东方教育科技有限公司|[\"在线教育\"]|学管师|2021-07-01|202...   \n",
       "4                                                     []   \n",
       "...                                                  ...   \n",
       "10868                                                 []   \n",
       "10869                                                 []   \n",
       "10870                                                 []   \n",
       "10871                                                 []   \n",
       "10872                                                 []   \n",
       "\n",
       "                                          attachmentList  school        major  \\\n",
       "0      ['湖南科技大学-数据开发-李敏佳|None|/userywfmzg/16642547805...  湖南科技大学   数据科学与大数据技术   \n",
       "1                                                     []  湖南信息学院         网络工程   \n",
       "2                                                     []                  NaN   \n",
       "3                                                     []    广州大学  数学与应用数学精算专业   \n",
       "4                                                     []  韩山师范学院          统计学   \n",
       "...                                                  ...     ...          ...   \n",
       "10868                                                 []                  NaN   \n",
       "10869                                                 []                  NaN   \n",
       "10870                                                 []                  NaN   \n",
       "10871                                                 []                  NaN   \n",
       "10872                                                 []                  NaN   \n",
       "\n",
       "      degree  \n",
       "0         本科  \n",
       "1         本科  \n",
       "2        NaN  \n",
       "3         本科  \n",
       "4         本科  \n",
       "...      ...  \n",
       "10868    NaN  \n",
       "10869    NaN  \n",
       "10870    NaN  \n",
       "10871    NaN  \n",
       "10872    NaN  \n",
       "\n",
       "[8281 rows x 21 columns]"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 将这些数据归结为学校、专业、学位\n",
    "PeopleDetail['school'] = np.nan\n",
    "PeopleDetail['major'] = np.nan\n",
    "PeopleDetail['degree'] = np.nan\n",
    "\n",
    "\n",
    "def f(row):\n",
    "    if isinstance(row['学校2'],str):\n",
    "        row[\"school\"] = row['学校2']\n",
    "    elif isinstance(row['学校1'],str):\n",
    "        row['school'] = row['学校1']\n",
    "    else:\n",
    "        # row['school'] = \"\"\n",
    "        pass\n",
    "    \n",
    "    \n",
    "    if isinstance(row['专业2'],str):\n",
    "        row[\"major\"] = row['专业2']\n",
    "    elif isinstance(row['专业1'],str):\n",
    "        row['major'] = row['专业1']\n",
    "    else:\n",
    "        # row['major'] = \"\"\n",
    "        pass\n",
    "\n",
    "\n",
    "    if isinstance(row['学位2'],str):\n",
    "        row[\"degree\"] = row['学位2']\n",
    "    elif isinstance(row['学位1'],str):\n",
    "        row['degree'] = row['学位1']\n",
    "    else:\n",
    "        # row['degree'] = \"\"\n",
    "        pass\n",
    "        \n",
    "    return row\n",
    "\n",
    "# Run the row-wise consolidation, then discard the raw education columns it replaces.\n",
    "PeopleDetail = PeopleDetail.apply(f, axis=1)\n",
    "\n",
    "raw_education_columns = [\n",
    "    '学校1', '专业1', '学位1', '起始时间1', '毕业时间1',\n",
    "    '学校2', '专业2', '学位2', '起始时间2', '毕业时间2',\n",
    "]\n",
    "PeopleDetail = PeopleDetail.drop(columns=raw_education_columns)\n",
    "PeopleDetail\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 8.projectExperienceList,competitionExperienceList,trainingExperienceList,skillList,languageList,certList,workExperienceList\n",
    "\n",
    "\n",
    "这些列有的是列表型数据，用|隔开"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>projectExperienceList</th>\n",
       "      <th>competitionExperienceList</th>\n",
       "      <th>trainingExperienceList</th>\n",
       "      <th>skillList</th>\n",
       "      <th>languageList</th>\n",
       "      <th>certList</th>\n",
       "      <th>workExperienceList</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>['工资管理系统|None|核心成员|2020-11-28|2020-12-10|使用Jav...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['英语|GOOD']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>['校园综合平台小程序|湖南信息学院|前端开发|2021-03-03|2021-05-05|...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['英语|COMMONLY']</td>\n",
       "      <td>[]</td>\n",
       "      <td>['None|湖南游掌竟网络科技有限公司|[\"游戏\"]|前端开发实习|2021-07-01|...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[]</td>\n",
       "      <td>['其他|“育知联杯”智慧商业大数据创新应用大赛|一等奖|None', '其他|湖北省信创大...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['工信部证书|达梦认证证书|2022-05-26|', '泰迪科技实习证明|泰迪科技实习证...</td>\n",
       "      <td>['None|泰迪智能科技股份有限公司|[\"数据服务\",\"互联网\"]|数据分析师|2022-...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>[]</td>\n",
       "      <td>['其他|“互联网+”大学生创业大赛|校级铜奖|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>['计算机二级|SKILLED']</td>\n",
       "      <td>['英语|GOOD']</td>\n",
       "      <td>[]</td>\n",
       "      <td>['None|新东方教育科技有限公司|[\"在线教育\"]|学管师|2021-07-01|202...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>['航空公司客户价值分析|None|数据分析师|2021-11-18|2021-12-01|...</td>\n",
       "      <td>['泰迪杯|第八届泰迪杯数据挖掘挑战赛|三等奖|None']</td>\n",
       "      <td>[]</td>\n",
       "      <td>['python|GOOD']</td>\n",
       "      <td>['英语四级|GOOD']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10868</th>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['CBDA技能证书|大数据分析工程师-基础级|2018-04-13|None']</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10869</th>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10870</th>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['CBDA技能证书|大数据分析工程师-基础级|2018-04-13|None']</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10871</th>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10872</th>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8281 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                   projectExperienceList  \\\n",
       "0      ['工资管理系统|None|核心成员|2020-11-28|2020-12-10|使用Jav...   \n",
       "1      ['校园综合平台小程序|湖南信息学院|前端开发|2021-03-03|2021-05-05|...   \n",
       "2                                                     []   \n",
       "3                                                     []   \n",
       "4      ['航空公司客户价值分析|None|数据分析师|2021-11-18|2021-12-01|...   \n",
       "...                                                  ...   \n",
       "10868                                                 []   \n",
       "10869                                                 []   \n",
       "10870                                                 []   \n",
       "10871                                                 []   \n",
       "10872                                                 []   \n",
       "\n",
       "                               competitionExperienceList  \\\n",
       "0                                                     []   \n",
       "1                                                     []   \n",
       "2      ['其他|“育知联杯”智慧商业大数据创新应用大赛|一等奖|None', '其他|湖北省信创大...   \n",
       "3                         ['其他|“互联网+”大学生创业大赛|校级铜奖|None']   \n",
       "4                         ['泰迪杯|第八届泰迪杯数据挖掘挑战赛|三等奖|None']   \n",
       "...                                                  ...   \n",
       "10868                                                 []   \n",
       "10869                                                 []   \n",
       "10870                                                 []   \n",
       "10871                                                 []   \n",
       "10872                                                 []   \n",
       "\n",
       "      trainingExperienceList          skillList     languageList  \\\n",
       "0                         []                 []      ['英语|GOOD']   \n",
       "1                         []                 []  ['英语|COMMONLY']   \n",
       "2                         []                 []               []   \n",
       "3                         []  ['计算机二级|SKILLED']      ['英语|GOOD']   \n",
       "4                         []    ['python|GOOD']    ['英语四级|GOOD']   \n",
       "...                      ...                ...              ...   \n",
       "10868                     []                 []               []   \n",
       "10869                     []                 []               []   \n",
       "10870                     []                 []               []   \n",
       "10871                     []                 []               []   \n",
       "10872                     []                 []               []   \n",
       "\n",
       "                                                certList  \\\n",
       "0                                                     []   \n",
       "1                                                     []   \n",
       "2      ['工信部证书|达梦认证证书|2022-05-26|', '泰迪科技实习证明|泰迪科技实习证...   \n",
       "3                                                     []   \n",
       "4                                                     []   \n",
       "...                                                  ...   \n",
       "10868          ['CBDA技能证书|大数据分析工程师-基础级|2018-04-13|None']   \n",
       "10869          ['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']   \n",
       "10870          ['CBDA技能证书|大数据分析工程师-基础级|2018-04-13|None']   \n",
       "10871          ['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']   \n",
       "10872          ['CBDA技能证书|大数据开发工程师-基础级|2018-04-13|None']   \n",
       "\n",
       "                                      workExperienceList  \n",
       "0                                                     []  \n",
       "1      ['None|湖南游掌竟网络科技有限公司|[\"游戏\"]|前端开发实习|2021-07-01|...  \n",
       "2      ['None|泰迪智能科技股份有限公司|[\"数据服务\",\"互联网\"]|数据分析师|2022-...  \n",
       "3      ['None|新东方教育科技有限公司|[\"在线教育\"]|学管师|2021-07-01|202...  \n",
       "4                                                     []  \n",
       "...                                                  ...  \n",
       "10868                                                 []  \n",
       "10869                                                 []  \n",
       "10870                                                 []  \n",
       "10871                                                 []  \n",
       "10872                                                 []  \n",
       "\n",
       "[8281 rows x 7 columns]"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "PeopleDetail[['projectExperienceList','competitionExperienceList',\"trainingExperienceList\",\"skillList\",\"languageList\",\"certList\",\"workExperienceList\"]]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "对于competitionExperienceList字段而言，我们提取出竞赛名称和奖项"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[]                                                                                                                                                      8198\n",
      "['泰迪杯|第八届泰迪杯数据挖掘挑战赛|三等奖|None']                                                                                                                             4\n",
      "['泰迪杯|第八届泰迪杯数据挖掘挑战赛|成功参赛|None']                                                                                                                            4\n",
      "['泰迪杯|第八届泰迪杯数据挖掘挑战赛|二等奖|None']                                                                                                                             3\n",
      "['泰迪杯|第九届泰迪杯数据挖掘挑战赛|三等奖|None', '泰迪杯|第四届泰迪杯数据分析技能赛|二等奖|None', '其他|全国大学生数学建模竞赛|国家二等奖|None', '其他|互联网+创新创业大赛|校级金奖|None']                                       2\n",
      "                                                                                                                                                        ... \n",
      "['泰迪杯|第八届泰迪杯数据挖掘挑战赛|三等奖|/userfjinhc/118887-jpg/泰迪杯.jpg', '其他|数学建模2021|省级二等奖|None']                                                                         1\n",
      "['其他|2020年大学生网络安全知识竞赛|团体三等奖|/userfyja0w/150654-jpg/网络知识竞赛.jpg', '其他|大学生创新创业大赛|优秀奖|/userfyja0w/83295-jpg/创新创业.jpg']                                         1\n",
      "['其他|韩山师范学院校合唱比赛|三等奖|None', '其他|数学与统计学院课前十分钟|优秀演讲者奖|None', '其他|大学生网络安全知识竞赛总决赛|二等奖|None', '其他|第九届“泰迪杯”数据挖掘挑战赛|省优胜奖|None', '其他|全国大学生数学建模竞赛|省二等奖|None']       1\n",
      "['其他|“育知联杯”智慧商业大数据创新应用大赛|一等奖|None', '其他|湖北省信创大赛—“达梦杯”|一等奖|None', '其他|技能竞赛“大数据分析赛项”|二等奖|None']                                                              1\n",
      "['其他|全国大学生数学建模竞赛|省二等奖|None']                                                                                                                               1\n",
      "Name: competitionExperienceList, Length: 73, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "print(PeopleDetail['competitionExperienceList'].value_counts())\n",
    "def f(x):\n",
    "    if x != \"[]\":\n",
    "        res = []\n",
    "        x = x.split(\",\")\n",
    "        for i in range(len(x)):\n",
    "            temp = x[i].split(\"|\")\n",
    "            res.append(temp[1]+\":\"+temp[2])\n",
    "        return str(res)\n",
    "    \n",
    "    else:\n",
    "        return x      \n",
    "PeopleDetail['competitionExperienceList'] = PeopleDetail['competitionExperienceList'].map(f)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "去除这些字段的一些乱七八糟的字符"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "def f(x):\n",
    "    if x:\n",
    "        if \",\" in x:\n",
    "            x = x.replace(\",\",\"|\")\n",
    "        x = x.replace(\"\\\"\",\"\")\n",
    "        x = x.replace(\"[\",\"\")\n",
    "        x = x.replace(\"]\",\"\")\n",
    "        x = x.replace(\"\\'\",\"\")\n",
    "        return x\n",
    "# Apply the punctuation clean-up to every column that holds list-like text.\n",
    "for i in [\"expectIndustry\",'keywordList','projectExperienceList','competitionExperienceList','trainingExperienceList','skillList','languageList','certList','workExperienceList']:\n",
    "    PeopleDetail[i] = PeopleDetail[i].map(f)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "skillList进行处理，因为技能是有等级的。因此，提取出等级来"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                                                                                                                                         7686\n",
      "Python|GOOD                                                                                                                                                               201\n",
      "Python|GOOD| NumPy|GOOD| Matplotlib|GOOD| pandas|GOOD| scikit-learn|GOOD                                                                                                  173\n",
      "Python|GOOD| urllib3库|GOOD| Requests库|GOOD| Xpath|GOOD| Beautiful Soup|GOOD| Scrapy|GOOD| MySQL|GOOD| MongoDB|GOOD                                                         88\n",
      "Hadoop|GOOD| HDFS|GOOD| MapReduce|GOOD| Java|GOOD                                                                                                                          21\n",
      "                                                                                                                                                                         ... \n",
      "Python|GOOD| MySQL|GOOD| SPSS|GOOD| Excel|GOOD                                                                                                                              1\n",
      "Python|SKILLED| MATLAB|SKILLED| JavaScript|GOOD| C++|GOOD| SPSS|GOOD                                                                                                        1\n",
      "Excel|SKILLED| Python|GOOD                                                                                                                                                  1\n",
      "HTML|GOOD| JavaScript|COMMONLY| Java|COMMONLY| css|GOOD                                                                                                                     1\n",
      "Python|GOOD| Hadoop|GOOD| urllib3库|GOOD| HDFS|GOOD| Requests库|GOOD| MapReduce|GOOD| Xpath|GOOD| Java|GOOD| Beautiful Soup|GOOD| Scrapy|GOOD| MySQL|GOOD| MongoDB|GOOD       1\n",
      "Name: skillList, Length: 100, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Process skillList: print the value distribution before the level markers are relabelled.\n",
    "\n",
    "print(PeopleDetail['skillList'].value_counts())\n",
    "def f(x):\n",
    "    x = x.replace(\"|GOOD\",\"良好\")\n",
    "    x = x.replace(\"|SKILLED\",\"熟练\")\n",
    "    x = x.replace(\"|COMMONLY\",\"一般\")\n",
    "    x = x.replace(\"|MASTER\",\"精通\")\n",
    "    return x\n",
    "PeopleDetail['skillList'] = PeopleDetail['skillList'].map(f)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "languageList这一列鱼龙混杂，写什么的都有，数据量很少，删掉"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "# PeopleDetail['languageList'].value_counts().to_csv(\"./temp.csv\")\n",
    "del PeopleDetail['languageList']"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "trainingExperienceList进行处理\n",
    "\n",
    "去除日期"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "# PeopleDetail['trainingExperienceList'].value_counts().to_csv(\"./temp.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "def f(x):\n",
    "    import re\n",
    "    \n",
    "    x = re.sub(r'\\|\\d{4}-\\d{2}-\\d{2}\\|\\d{4}-\\d{2}-\\d{2}\\|', '|', x)\n",
    "    x = re.sub(r'None\\|', '', x)\n",
    "    x = re.sub(r'\\d{4}-\\d{2}-\\d{2}\\|', '', x)\n",
    "    return x\n",
    "PeopleDetail['trainingExperienceList'] = PeopleDetail['trainingExperienceList'].map(f)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "workExperienceList进行处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): dumps the value counts to ./temp.csv, presumably for manual inspection; the same dump is commented out for other columns — confirm whether this one should stay.\n",
    "PeopleDetail[\"workExperienceList\"].value_counts().to_csv(\"./temp.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "def f(x):\n",
    "    import re\n",
    "    x = re.sub(r'\\|\\d{4}-\\d{2}-\\d{2}\\|\\d{4}-\\d{2}-\\d{2}\\|', '|', x)\n",
    "    x = re.sub(r'None\\|', '', x)\n",
    "    x = re.sub(r'\\d{4}-\\d{2}-\\d{2}\\|', '', x)\n",
    "    return x\n",
    "PeopleDetail[\"workExperienceList\"] = PeopleDetail[\"workExperienceList\"].map(f)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "projectExperienceList进行处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "def f(x):\n",
    "    import re\n",
    "    \n",
    "    x = re.sub(r'\\|\\d{4}-\\d{2}-\\d{2}\\|\\d{4}-\\d{2}-\\d{2}\\|', '|', x)\n",
    "    x = re.sub(r'None\\|', '', x)\n",
    "    x = re.sub(r'\\d{4}-\\d{2}-\\d{2}\\|', '', x)\n",
    "    return x\n",
    "PeopleDetail[\"projectExperienceList\"] = PeopleDetail[\"projectExperienceList\"].map(f)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 9.attachmentList\n",
    "\n",
    "附件列表删除"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "del PeopleDetail['attachmentList']"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 10.PeopleDetail中的地区是所在地，不需要，删除掉"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "del PeopleDetail['province']\n",
    "del PeopleDetail['city']\n",
    "del PeopleDetail['region'] \t"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 次数构造"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "def CountSplit1(x):\n",
    "    if isinstance(x,str):\n",
    "        x = x.split(\"| \")\n",
    "        return len(x)\n",
    "    return 0\n",
    "\n",
    "def CountSplit2(x):\n",
    "    if isinstance(x,str):\n",
    "        x = x.split(\" \")\n",
    "        return len(x)\n",
    "    return 0\n",
    "\n",
    "# Derived count features: one count per row for certList and for competitionExperienceList.\n",
    "PeopleDetail['certListCount'] = PeopleDetail['certList'].map(CountSplit2)\n",
    "PeopleDetail['competitionExperienceListCount'] = PeopleDetail['competitionExperienceList'].map(CountSplit1)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 导出数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist both processed tables; index=False keeps the row index out of the CSV files.\n",
    "People.to_csv(\"../Data/ProcessData/People.csv\", index=False, encoding=\"utf-8\")\n",
    "PeopleDetail.to_csv(\"../Data/ProcessData/PeopleDetail.csv\", index=False, encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.1"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
